In [18]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
from mpltools import style
style.use('ggplot')

<h2>Fisher's Exact Test</h2>
<p><a href="http://www.biostathandbook.com/fishers.html">Fisher's Exact Test</a> is used when you have two categorical variables and want to determine whether there is a statistical basis for membership in a class of one variable being associated with membership in a class of the other.</p>

In [3]:
# Observed 2x2 contingency table. Every parameter of the manual computation
# below is derived from it, so the manual and built-in results cannot drift
# apart if the counts are edited.
table = np.array([[4, 15], [11, 5]])
n_total = int(table.sum())        # all observations (35)
n_col1 = int(table[:, 0].sum())   # first-column total (15)
n_row1 = int(table[0].sum())      # first-row total (19)
k_obs = int(table[0, 0])          # observed top-left count (4)

# first 'manually compute': with the margins fixed, the top-left count is
# hypergeometric, so the one-sided ('less') p-value is the probability of
# seeing a count no larger than the observed one
steps = [stats.hypergeom.pmf(x, n_total, n_col1, n_row1)
         for x in range(k_obs + 1)]
print(sum(steps))

# now use the built in function.  n.b. the default 'alternative' is a two tailed
# test, so the one-sided alternative is requested explicitly
oddsratio, pvalue = stats.fisher_exact(table, alternative='less')
print(pvalue)

0.00578581676903
0.00578581676903


<p>The above is discussed in greater detail in the slides.</p>
<br> <br>
<h2>Pearson's $\chi^2$ Test</h2>

In [4]:
# Observed counts: one array per group, one entry per category.
X = np.array([27, 15, 30])
Y = np.array([18, 15, 8])

# Run the contingency test on the 2 x 3 table; the result unpacks into the
# test statistic, p-value, degrees of freedom and expected counts.
contingency_result = stats.chi2_contingency([X, Y])
chi2, p, ddof, expected = contingency_result

print("Observed:\n{}\n{}\n".format(X, Y))
print("Expected (if independent):\n", expected, "\n")

# Template for the three-line summary of the test result.
msg = "\n".join([
    "Test Statistic: {}",
    "p-value: {}",
    "Degrees of Freedom: {}",
])
print(msg.format(chi2, p, ddof))

Observed:
[27 15 30]
[18 15  8]

Expected (if independent):
 [[ 28.67256637  19.11504425  24.21238938]
 [ 16.32743363  10.88495575  13.78761062]] 

Test Statistic: 6.523368635002141
p-value: 0.038323794188885686
Degrees of Freedom: 2


<p>So we perform a cell by cell comparison to an expected value.  How do we get these expected values?</p>

In [5]:
#get expected values
# Under independence every row shares the same column proportions, so the
# expected count in a cell is (row total) * (column total / grand total).
obs_table = np.vstack([X, Y])
total_N = obs_table.sum()
ave = obs_table.sum(axis=0) / total_N   # pooled proportion in each column
X_exp = X.sum() * ave
Y_exp = Y.sum() * ave
print(X_exp, Y_exp)

#get test statistic
# Pearson statistic: sum over all cells of (observed - expected)^2 / expected.
all_obs = np.append(X, Y)
all_exp = np.append(X_exp, Y_exp)
test_stat = np.sum((all_obs - all_exp) ** 2 / all_exp)
print(test_stat)

#get p-value based on test-statistic and ndf
# Degrees of freedom follow from the table shape, (rows - 1) * (cols - 1),
# rather than being hard-coded for this particular 2 x 3 example.
n_rows, n_cols = obs_table.shape
ndf = (n_rows - 1) * (n_cols - 1)
print(stats.chi2.sf(test_stat, ndf))

[ 28.67256637  19.11504425  24.21238938] [ 16.32743363  10.88495575  13.78761062]
6.523368635
0.0383237941889


<h2>KS Test</h2>
<p>
The Kolmogorov–Smirnov (KS) test measures the maximum deviation between the empirical CDF of the data and the CDF of a particular model.
</p>

In [33]:
# Seed NumPy's global generator so the sample, and therefore the statistics
# printed below, is identical on every Restart & Run All.
np.random.seed(0)
data = np.random.standard_t(3, size=1000)

# compare the t-distributed data against a standard normal; kstest uses the
# named distribution with its default parameters, nothing is fitted
D, p_value = stats.kstest(data, 'norm')
print("KS test statistic for a normal fit: {}, p-Value: {}".format(D, p_value))

# compare the same t-distributed data against a standard uniform
D, p_value = stats.kstest(data, 'uniform')
print("KS test statistic for a uniform fit: {}, p-Value: {}".format(D, p_value))

KS test statistic for a normal fit: 0.06068261129898087, p-Value: 0.0012107591800181172
KS test statistic for a uniform fit: 0.506555231555142, p-Value: 0.0
