### BIOS470/570 Lecture 24: Statistics with Python

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as stats

import warnings
warnings.simplefilter(action="ignore",category=FutureWarning)

### Statistical testing is a means of determining whether data support a particular hypothesis, typically by comparison with a null hypothesis. A common example is asking whether two groups of data are different or the same: if I treat one set of samples with a chemical and another with a control, does my outcome differ between the groups in a statistically significant way?

### A t-test is used to determine whether the means of 2 samples are statistically different (i.e. whether the samples were drawn from distributions with different means). 

### Create two normal distributions with the same mean but different variances:

In [None]:
# Seed NumPy's legacy global RNG so that running the notebook top to bottom
# from a fresh kernel reproduces the same draws (and hence the same p-values)
# here and in later cells that draw from np.random.
np.random.seed(470)
# Two samples of 100,000 draws each: identical mean (0), different spread.
rand1 = np.random.normal(0, 3, 100000)  # mean 0, sigma 3
rand2 = np.random.normal(0, 1, 100000)  # mean 0, sigma 1
# Collect both samples as columns of one DataFrame for plotting and testing.
dists = pd.DataFrame({"rand1": rand1, "rand2": rand2})

### Plot the distributions using the matplotlib hist function:

In [None]:
# Overlay histograms of both samples on shared bins of width 0.1 from -10 to 10.
bins = np.arange(-10, 10, 0.1)
fig, ax = plt.subplots()
ax.hist(dists["rand1"], bins=bins, label='mean 0, sigma 3')
# Semi-transparent so the wider distribution stays visible underneath.
ax.hist(dists["rand2"], bins=bins, alpha=0.6, label='mean 0, sigma 1')
ax.legend()
ax.set_xlabel('Value')
ax.set_ylabel('Frequency');

### Seaborn also has good tools for visualizing this data:

In [None]:
# One box per DataFrame column (wide-form input).
sns.boxplot(data=dists);

In [None]:
# One violin per DataFrame column (wide-form input).
sns.violinplot(data=dists);

### A t-test is used to determine whether the means of 2 samples are statistically different (i.e. whether the samples were drawn from distributions with different means). 

In [None]:
# rand1 and rand2 were drawn with different sigmas (3 vs 1), so use Welch's
# t-test (equal_var=False) instead of the default pooled-variance Student's
# t-test, which assumes both populations share one variance.
test_result = stats.ttest_ind(dists.rand1, dists.rand2, equal_var=False)
# Bare last expression so the notebook displays the full result object.
test_result

In [None]:
# The p-value alone can be read from the result's `pvalue` attribute.
test_result.pvalue

### What happens if we have two normal distributions with different means:

In [None]:
# Two samples of 100 draws each with sigma 1; the means differ by 1.
rand1 = np.random.normal(loc=1, scale=1, size=100)
rand2 = np.random.normal(loc=0, scale=1, size=100)
dists = pd.DataFrame(data={"rand1": rand1, "rand2": rand2})

In [None]:
# Violin per column: shows how much the two samples overlap.
sns.violinplot(data=dists);

In [None]:
# Box per column for the same two samples.
sns.boxplot(data=dists);

In [None]:
# Independent two-sample t-test on the two columns; display the result.
test_result = stats.ttest_ind(dists["rand1"], dists["rand2"])
test_result

#### Even though these distributions have substantial overlap, the difference in the means is highly significant. 

### What if we measure fewer samples?

In [None]:
# Same means (1 vs 0) and sigma (1) as before, but only 5 draws per sample.
rand1 = np.random.normal(loc=1, scale=1, size=5)
rand2 = np.random.normal(loc=0, scale=1, size=5)
dists = pd.DataFrame(data={"rand1": rand1, "rand2": rand2})

In [None]:
# Independent two-sample t-test on the two 5-draw samples; display the result.
test_result = stats.ttest_ind(dists["rand1"], dists["rand2"])
test_result

### Let's run this many times and compute the p-values each time:

In [None]:
# Repeat the 5-vs-5 experiment (means 1 and 0, sigma 1) 1000 times and record
# the t-test p-value of every repetition.
pvalues = []
for trial in range(1000):
    rand1 = np.random.normal(loc=1, scale=1, size=5)
    rand2 = np.random.normal(loc=0, scale=1, size=5)
    dists = pd.DataFrame(data={"rand1": rand1, "rand2": rand2})
    test_result = stats.ttest_ind(dists["rand1"], dists["rand2"])
    pvalues.append(test_result.pvalue)

In [None]:
# Histogram of the simulated p-values in 20 bins, with labelled axes so the
# figure can be read on its own.
plt.hist(pvalues, 20)
plt.xlabel('p-value')
plt.ylabel('Number of trials');

### How many of these trials are "statistically significant"?

In [None]:
# Count the trials whose p-value falls below the 0.05 threshold.
np.sum(np.array(pvalues) < 0.05)

### Reduce the variance in the distributions:

In [None]:
# Same 1000-repetition experiment (means 1 and 0, 5 draws each), but with the
# standard deviation of both distributions reduced to 0.3.
pvalues = []
for trial in range(1000):
    rand1 = np.random.normal(loc=1, scale=0.3, size=5)
    rand2 = np.random.normal(loc=0, scale=0.3, size=5)
    dists = pd.DataFrame(data={"rand1": rand1, "rand2": rand2})
    test_result = stats.ttest_ind(dists["rand1"], dists["rand2"])
    pvalues.append(test_result.pvalue)

In [None]:
# Histogram of the simulated p-values in 20 bins, with labelled axes so the
# figure can be read on its own.
plt.hist(pvalues, 20)
plt.xlabel('p-value')
plt.ylabel('Number of trials');

In [None]:
# Count the trials whose p-value falls below the 0.05 threshold.
np.sum(np.array(pvalues) < 0.05)

In [None]:
### Keep the large variance, but separate the means further:

In [None]:
# Same 1000-repetition experiment with sigma 1 and 5 draws each, but with the
# means moved further apart (5 vs 0).
pvalues = []
for trial in range(1000):
    rand1 = np.random.normal(loc=5, scale=1, size=5)
    rand2 = np.random.normal(loc=0, scale=1, size=5)
    dists = pd.DataFrame(data={"rand1": rand1, "rand2": rand2})
    test_result = stats.ttest_ind(dists["rand1"], dists["rand2"])
    pvalues.append(test_result.pvalue)

In [None]:
# Histogram of the simulated p-values in 20 bins, with labelled axes so the
# figure can be read on its own.
plt.hist(pvalues, 20)
plt.xlabel('p-value')
plt.ylabel('Number of trials');

In [None]:
# Count the trials whose p-value falls below the 0.05 threshold.
np.sum(np.array(pvalues) < 0.05)

### We see that the number of samples we need to measure to get a significant effect depends on the size of the effect we are measuring (i.e. the difference between the means) and the variability of our measurements (the standard deviation of the distributions). 

### Recall our original case, many samples, same mean, different variances. What if we want to determine if the distributions are different (not just the means)?

In [None]:
# Recreate the first scenario: 100,000 draws each, both with mean 0, with
# sigma 3 for rand1 and sigma 1 for rand2.
rand1 = np.random.normal(loc=0, scale=3, size=100000)
rand2 = np.random.normal(loc=0, scale=1, size=100000)
dists = pd.DataFrame(data={"rand1": rand1, "rand2": rand2})

In [None]:
# Overlay histograms of both samples on shared bins of width 0.1 from -10 to 10.
bins = np.arange(-10, 10, 0.1)
fig, ax = plt.subplots()
ax.hist(dists["rand1"], bins=bins, label='mean 0, sigma 3')
# Semi-transparent so the wider distribution stays visible underneath.
ax.hist(dists["rand2"], bins=bins, alpha=0.6, label='mean 0, sigma 1')
ax.legend()
ax.set_xlabel('Value')
ax.set_ylabel('Frequency');

### Another way to look at this is the cumulative distribution, which plots the fraction of the distribution that is at or below a particular value:

In [None]:
# Cumulative, normalized histograms of both samples. With density=True and
# cumulative=True the bar heights accumulate up to 1, so the y axis is a
# cumulative fraction rather than a count; label it accordingly.
plt.hist(dists.rand1, bins, density=True, cumulative=True, label='mean 0, sigma 3')
plt.hist(dists.rand2, bins, density=True, cumulative=True, alpha=0.4, label='mean 0, sigma 1')
plt.legend()
plt.xlabel('Value')
plt.ylabel('Cumulative fraction');

### The Kolmogorov-Smirnov (KS) test uses the maximum distance between two cumulative distributions to determine whether the distributions are different:

In [None]:
# Two-sample KS test comparing the two columns' distributions.
stats.ks_2samp(dists["rand1"], dists["rand2"])

In [None]:
# For comparison, test the means. rand1 and rand2 were drawn with different
# sigmas (3 vs 1), so use Welch's t-test (equal_var=False) instead of the
# default pooled-variance test, which assumes a shared variance.
stats.ttest_ind(dists.rand1, dists.rand2, equal_var=False)

### So the difference between these distributions is highly significant even though the means are the same. 

### Let's look at these tests applied to a more realistic dataset. Seaborn has a number of example datasets that can be loaded with `load_dataset`.

In [None]:
# Load seaborn's 'iris' example dataset (used as a pandas DataFrame below).
dat = sns.load_dataset('iris')

In [None]:
# Display the loaded dataset.
dat

In [None]:
# List the distinct values in the species column.
dat["species"].unique()

In [None]:
# Box plot of the whole dataset passed as wide-form data.
sns.boxplot(data=dat);

In [None]:
# Sepal width grouped by species, one box per species.
sns.boxplot(data=dat, x="species", y="sepal_width");

In [None]:
# Sepal length grouped by species, one violin per species.
sns.violinplot(data=dat, x="species", y="sepal_length");

### Are these differences in sepal width significant between species?

In [None]:
# Reshape sepal_width into wide form: one column per species, with each
# species' measurements re-indexed from 0 so the columns line up row by row.
species = ['setosa', 'versicolor', 'virginica']
swidth = pd.DataFrame()
for name in species:
    is_species = dat["species"] == name
    swidth[name] = dat.loc[is_species, "sepal_width"].reset_index(drop=True)
    

In [None]:
# Display the per-species sepal-width table built above.
swidth

In [None]:
# t-test on sepal width: versicolor vs virginica.
stats.ttest_ind(swidth["versicolor"], swidth["virginica"])

In [None]:
# t-test on sepal width: setosa vs virginica.
stats.ttest_ind(swidth["setosa"], swidth["virginica"])

In [None]:
# Reshape sepal_length into wide form (one column per species, rows re-indexed
# from 0), then t-test versicolor against virginica.
species = ['setosa', 'versicolor', 'virginica']
slength = pd.DataFrame()
for name in species:
    is_species = dat["species"] == name
    slength[name] = dat.loc[is_species, "sepal_length"].reset_index(drop=True)
stats.ttest_ind(slength["versicolor"], slength["virginica"])

In [None]:
# Two-sample KS test on sepal length: versicolor vs virginica.
stats.ks_2samp(slength["versicolor"], slength["virginica"])