7. Hypothesis Testing: Introduction to Hypothesis Testing (p-value Approach)

Computing the Test Statistic

In [220]:
import numpy as np

In [246]:
def z_statistic(u0, n, X=0, std=1, prop=False):
    """
    Compute the Z test statistic for a one-sample Z-test.

    The observed sample value X is standardized against the
    hypothesized value u0:

    * means (prop=False):      z = (X - u0) / (std / sqrt(n))
    * proportions (prop=True): z = (X - u0) / sqrt(u0 * (1 - u0) / n)

    Parameters
    ------------------------
    u0  : hypothesized mean (or hypothesized proportion when prop=True)
    n   : number of samples
    X   : sample mean (or sample proportion when prop=True)
    std : standard deviation of population; ignored when prop=True
    prop: if True, use the standard error of a proportion under H0

    Returns
    ------------------------
    The z-score of X relative to u0.
    """
    if prop:
        # For proportions the standard error follows from u0 itself.
        standard_error = np.sqrt(u0 * (1 - u0) / n)
    else:
        standard_error = std / np.sqrt(n)
    return (X - u0) / standard_error

In [247]:
# Z-statistic for a sample mean, using the population standard deviation.
u0  = 13   # hypothesized mean
X   = 13   # sample mean
n   = 57   # number of samples
std = 3.1  # standard deviation of the population (passed to z_statistic)
s   = 3.0  # not used by the call below
# NOTE(review): with X == u0 the statistic evaluates to 0, which does not
# match the recorded output (-4.4977...). The inputs were presumably edited
# after the cell was last run -- confirm the intended values and re-run.

z_statistic(u0, n, X=X, std=std)

-4.497658079466679

Calculating p and making a decision

In [232]:
from scipy.stats import norm

In [233]:
def p_value(z, tailed):
    """
    Calculate the p-value of a Z-test given a direction 'tailed'.

    Parameters
    ---------------
    z: z-value (test statistic)
    tailed: 'two', 'left' or 'right' (case-insensitive)

    Returns
    ---------------
    Probability, under the standard normal distribution, of a value
    at least as extreme as z in the requested direction.

    Raises
    ---------------
    ValueError: if 'tailed' is not one of the accepted directions.
    """
    direction = tailed.lower()
    # norm.sf is the survival function (1 - cdf). Computing the upper tail
    # directly avoids the cancellation in `1 - norm.cdf(z)`, which rounds
    # to exactly 0 for large z.
    if direction == 'two':
        return 2 * norm.sf(abs(z))
    if direction == 'left':
        return norm.cdf(z)
    if direction == 'right':
        return norm.sf(z)
    # Fail loudly instead of hitting an unbound local variable.
    raise ValueError(
        "tailed must be 'two', 'left' or 'right', got {!r}".format(tailed))


In [245]:
# Left-tailed Z-test of a mean.
H0    = 14.5   # hypothesized mean
X     = 14.4   # sample mean
std   = 1.6    # standard deviation of the population
n     = 235    # number of samples
alpha = 0.03   # significance level (not compared against p in this cell)
tailed = 'left'

Z = z_statistic(u0=H0, n=n, X=X, std=std)
p_value(z=Z, tailed=tailed)

0.16900443899979922

Connecting hypothesis testing and confidence intervals

In [210]:
def critical_value(ci):
    """
    Two-sided critical value z* of the standard normal distribution.

    Parameters
    ----------
    ci: confidence level in % (e.g. 95 for a 95% interval)

    Returns
    -------
    The standard normal quantile at cumulative probability
    (100 + ci) / 200.
    """
    # A central area of ci% leaves (100 - ci) / 2 % in each tail, so z* sits
    # at cumulative probability ci/100 + (100 - ci)/200 = (100 + ci) / 200.
    cumulative_probability = (100 + ci) / 200
    return norm.ppf(cumulative_probability, loc=0, scale=1)

def confidence_interval(n, std, x, ci):
    """
    Confidence interval centred on the sample mean.

    Parameters
    ----------
    n  : sample size
    std: standard deviation of the population
    x  : mean of the sample
    ci : confidence level in %

    Returns
    -------
    Tuple (lower, upper) = (x - margin, x + margin), where
    margin = critical_value(ci) * std / sqrt(n).
    """
    standard_error = std / np.sqrt(n)
    margin = critical_value(ci) * standard_error
    lower_bound = x - margin
    upper_bound = x + margin
    return lower_bound, upper_bound

In [216]:
# Confidence interval for the average IQ of the surveyed Icelanders.
x   = 103  # average IQ in the sample
n   = 120  # number of Icelanders surveyed
std = 18   # standard deviation of the population
ci  = 96   # confidence level in %

lower, upper = confidence_interval(n=n, std=std, x=x, ci=ci)
print('CI = ({:6.3f},{:6.3f})'.format(lower, upper))

CI = (99.625,106.375)


In [217]:
# Same interval computation for a second sample of Icelanders.
x   = 94   # average IQ in the sample
n   = 137  # number of Icelanders surveyed
std = 18   # standard deviation of the population
ci  = 96   # confidence level in %

lower, upper = confidence_interval(n=n, std=std, x=x, ci=ci)
print('CI = ({:6.3f},{:6.3f})'.format(lower, upper))

CI = (90.842,97.158)


In [229]:
# Two-tailed Z-test for a proportion (H0: proportion = u0).
u0   = 0.85     # proportion claimed by the politician
n    = 125      # number of citizens surveyed
p    = 115 / n  # observed proportion that agrees
a    = 0.04     # level of significance
tail = 'two'

z = z_statistic(u0, n, X=p, prop=True)
p_value(z=z, tailed=tail)

0.028395033359778

Small-sample proportion test: test statistic and p-value

In [167]:
from scipy.stats import binom

def p_value_nsmall(X, n, pi0, tailed):
    """
    Calculate the exact binomial p-value given a direction 'tailed'
    for a small sample.

    Parameters
    ---------------
    X:   test statistic (observed count, compared against Binomial(n, pi0))
    n:   number of observations
    pi0: hypothesized proportion
    tailed: 'two', 'left' or 'right' (case-insensitive)

    Returns
    ---------------
    The p-value; the two-tailed value is twice the smaller tail,
    capped at 1.

    Raises
    ---------------
    ValueError: if 'tailed' is not one of the accepted directions.
    """
    direction = tailed.lower()
    if direction not in ('two', 'left', 'right'):
        # Fail loudly instead of hitting an unbound local variable.
        raise ValueError(
            "tailed must be 'two', 'left' or 'right', got {!r}".format(tailed))

    lower_tail = binom.cdf(X, n, pi0)     # P(K <= X)
    # sf(X - 1) = P(K > X - 1), i.e. P(K >= X) for an integer count X;
    # binom.sf avoids the rounding of `1 - binom.cdf(...)`.
    upper_tail = binom.sf(X - 1, n, pi0)

    if direction == 'left':
        return lower_tail
    if direction == 'right':
        return upper_tail
    # Both tails include P(K == X), so doubling the smaller one can exceed 1
    # near the centre of the distribution; a p-value must stay within [0, 1].
    return min(1.0, 2 * min(lower_tail, upper_tail))

In [168]:
# Right-tailed exact binomial test of the casino's claimed win probability.
pi0  = 0.20     # win probability claimed by the casino
n    = 29       # number of plays
X    = 2        # number of wins
a    = 0.08     # level of significance
tail = 'right'  # right-tailed test

p_value_nsmall(X=X, n=n, pi0=pi0, tailed=tail)

0.9872337433448695

In [169]:
# Left-tailed exact binomial test of the casino's claimed win probability.
pi0  = 0.25    # win probability claimed by the casino
n    = 23      # number of plays
X    = 2       # number of wins
a    = 0.09    # level of significance
tail = 'left'  # left-tailed test

p_value_nsmall(X=X, n=n, pi0=pi0, tailed=tail)

0.04920333524002274

In [170]:
# Quantile of Binomial(n, pi0) at cumulative probability 1 - a.
pi0  = 0.5      # success probability under H0 (fair coin)
n    = 22       # number of plays
a    = 0.10     # level of significance
tail = 'right'  # not used by the call below

# NOTE(review): for a discrete distribution ppf(q) is the smallest k with
# cdf(k) >= q. If the right-tailed rejection threshold is what is wanted,
# it may need to be this value + 1 -- confirm against the exercise.
binom.ppf(1 - a, n, pi0)

14.0

In [171]:
# Quantile of Binomial(n, pi0) at cumulative probability 1 - a.
pi0  = 0.25     # success probability under H0
n    = 17       # number of plays
a    = 0.10     # level of significance
tail = 'right'  # not used by the call below

# NOTE(review): for a discrete distribution ppf(q) is the smallest k with
# cdf(k) >= q. If the right-tailed rejection threshold is what is wanted,
# it may need to be this value + 1 -- confirm against the exercise.
binom.ppf(1 - a, n, pi0)

7.0

One-sample t-test: Test statistic and p-value

In [172]:
from scipy.stats import t

def t_statistic(u0, n, X=0, s=1):
    r"""
    The t-statistic for a one-sample t-test.
    \[ t = \frac{\bar{X} - u_0}{s/\sqrt{n}} \]

    Parameters
    ------------------------
    u0  : hypothesized mean
    n   : number of samples
    X   : sample mean
    s   : standard deviation of the sample

    Returns
    ------------------------
    The t-statistic, (X - u0) / (s / sqrt(n)).
    """
    # The docstring is a raw string so the LaTeX backslashes are kept
    # literally (a plain string would turn "\f" into a form feed).
    standard_error = s / np.sqrt(n)
    return (X - u0) / standard_error

def p_value_ttest(ts, n, tailed):
    """
    Calculate the p-value of a one-sample t-test given a direction 'tailed'.

    Parameters
    ---------------
    ts: t-value
    n : number of samples (df = n - 1 is calculated within the code)
    tailed: 'two', 'left' or 'right' (case-insensitive)

    Returns
    ---------------
    Probability, under Student's t distribution with n - 1 degrees of
    freedom, of a value at least as extreme as ts in the requested direction.

    Raises
    ---------------
    ValueError: if 'tailed' is not one of the accepted directions.
    """
    df = n - 1
    direction = tailed.lower()
    if direction == 'two':
        # Use |ts| and the upper tail so the result is symmetric in the sign
        # of ts; `2 * t.cdf(ts, df)` exceeds 1 whenever ts is positive.
        return 2 * t.sf(abs(ts), df)
    if direction == 'left':
        return t.cdf(ts, df)
    if direction == 'right':
        # t.sf is the survival function (1 - cdf), computed without the
        # rounding of an explicit subtraction.
        return t.sf(ts, df)
    # Fail loudly instead of hitting an unbound local variable.
    raise ValueError(
        "tailed must be 'two', 'left' or 'right', got {!r}".format(tailed))

In [203]:
# Two-tailed one-sample t-test (H0: mean range = u0).
u0   = 470    # current range (km)
X    = 467.1  # observed range with the new batteries (km)
s    = 19.1   # standard deviation of the sample
n    = 55     # cars analyzed
tail = 'two'
a    = 0.01   # level of significance

ts = t_statistic(u0, n, X=X, s=s)
p = p_value_ttest(ts, n, tailed=tail)

# Report the p-value, then the decision at significance level a.
print('p = {:>6.4f}'.format(p))
if p < a:
    print('On the basis of this p-value, H0 should be rejected, \
because p < a')
else:
    print('On the basis of this p-value, H0 should not be rejected, \
because p > a')

p = 0.2651
On the basis of this p-value, H0 should not be rejected, because p > a


In [206]:
# Right-tailed one-sample t-test.
u0   = 36     # hypothesized mean
X    = 38     # sample mean
s    = 4.6    # standard deviation of the sample
n    = 31     # number of samples
tail = 'right'
a    = 0.1    # level of significance

ts = t_statistic(u0, n, X=X, s=s)
p = p_value_ttest(ts, n, tailed=tail)

# Report the p-value, then the decision at significance level a.
print('p = {:>6.4f}'.format(p))
if p < a:
    print('On the basis of this p-value, H0 should be rejected, \
because p < a')
else:
    print('On the basis of this p-value, H0 should not be rejected, \
because p > a')

p = 0.0109
On the basis of this p-value, H0 should be rejected, because p < a
