In [None]:
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# import libraries 

import pandas as pd

# this is here to shut off some annoying warnings from pandas
pd.options.mode.chained_assignment = None

# matplotlib is one of the main plotting libraries we're going to use
import matplotlib 
import matplotlib.pyplot as plt
%matplotlib inline

# the other plotting library is seaborn - we'll use both during the class

# import seaborn as sns

# numpy and scipy are for handling numerical and scientific data

import numpy as np
import scipy as sp

#import statsmodels.formula.api as smf # basic statistical modeling

from scipy.stats.stats import pearsonr 

import os

from scipy import stats
from scipy.stats import ks_2samp
from scipy.stats import entropy
from scipy.stats import norm
from scipy.stats import lognorm
from scipy.stats import nbinom
from scipy.stats.mstats import gmean

#from pandas.tools.plotting import scatter_matrix



In [None]:
# Read the tab-separated expression table and index it by gene name.

datafile = "data/barton/Barton_combined_Ygenes.txt"

# sep='\t' marks the file as tab-delimited; the 'Gene' column becomes the row index
df = pd.read_csv(datafile, sep='\t').set_index('Gene')

df.head()

In [None]:
# Group the sample columns by genotype, keyed by a short label.

exps = {
    'wt': [col for col in df.columns if col.startswith('WT')],
    'mut': [col for col in df.columns if col.startswith('Snf2')],
}

# concatenating the two lists gives every sample column, wild type first
exps['all'] = exps['wt'] + exps['mut']

In [None]:
# Light cleanup: drop rows containing missing values, then keep only genes
# whose summed expression over all samples is above zero.

has_signal = df[exps['all']].sum(axis=1) > 0
df = df.dropna().loc[has_signal]

In [None]:
# Per-gene reference level: the mean across every sample column.
sample_cols = exps['all']
df['ref'] = df[sample_cols].mean(axis=1)

In [None]:
# Rescale each sample by the median of its per-gene ratios to the reference.
for col in exps['all']:
    ratio_to_ref = df[col] / df['ref']
    df[col] = df[col] / np.median(ratio_to_ref)

# the samples changed, so the reference is rebuilt from the rescaled values
df['ref'] = df[exps['all']].mean(axis=1)

In [None]:
# Display the table as the cell output (bare last expression).
df

In [None]:
def plotdiff(df,c1,c2,ax):
    """Scatter two expression columns on log-log axes, colored by 2-fold change.

    df -- DataFrame holding both columns
    c1 -- name of the column plotted on the x axis
    c2 -- name of the column plotted on the y axis
    ax -- existing matplotlib Axes to draw on (no figure is created here)

    Rows where c2 exceeds twice c1 are drawn red, rows where c1 exceeds twice
    c2 are drawn green, and rows where c1 lies between half and double c2
    (inclusive) are drawn in faint black.  The legend reports the red and
    green counts.  Nothing is returned.
    """
    xcol = df[c1]
    ycol = df[c2]

    ax.set_xscale('log')
    ax.set_yscale('log')

    # both axes share one range: from 1 up to twice the largest value seen
    upper = max(max(xcol.values), max(ycol.values)) * 2
    ax.set_xlim(1, upper)
    ax.set_ylim(1, upper)

    ax.set_xlabel("expression in " + c1)
    ax.set_ylabel("expression in " + c2)
    ax.set_title("Comparison of " + c1 + " " + c2)

    # more than 2x higher in c2
    up = df[ycol > 2 * xcol]
    ax.scatter(up[c1].values, up[c2].values, s=1, c='red',
               label='2x up, n = %d' % len(up))

    # more than 2x higher in c1
    down = df[xcol > 2 * ycol]
    ax.scatter(down[c1].values, down[c2].values, s=1, c='green',
               label='2x down, n = %d' % len(down))

    # everything within a factor of two
    steady = df[xcol.between(ycol * .5, ycol * 2.0)]

    # a small tweak: alpha below 1 makes these points partly transparent so
    # the black cloud does not dominate the picture
    ax.scatter(steady[c1].values, steady[c2].values, s=1, c='black', alpha=.1)
    ax.legend()
    

In [None]:
# One square panel comparing the first wild-type replicate with the reference.
fig, ax = plt.subplots(figsize=(6, 6))
plotdiff(df, 'WT_rep01', 'ref', ax)

In [None]:
# Does the fold change seen in one WT / mutant pair show up again in a second pair?

wt1, wt2 = 'WT_rep01', 'WT_rep02'
mut1, mut2 = 'Snf2_rep01', 'Snf2_rep02'

cols = [wt1, wt2, mut1, mut2]

# temporary frame: values that are not above 1 are masked to NaN, then every
# row containing a masked value is dropped
picked = df[cols]
tdf = picked[df > 1].dropna()

# log2 fold change (WT over mutant) in each pair
x = np.log2(tdf[wt1] / tdf[mut1]).values
y = np.log2(tdf[wt2] / tdf[mut2]).values

# per-point color and size:
#   green, large - pair 1 changes more than 2-fold and pair 2 agrees in direction
#   red, large   - pair 1 changes more than 2-fold but pair 2 does not
#   black, small - pair 1 changes 2-fold or less
c = []
s = []

for fc1, fc2 in zip(x, y):
    if fc1 > 1.0:
        agrees = fc2 > 1.0
    elif fc1 < -1.0:
        agrees = fc2 < -1.0
    else:
        c.append('k')
        s.append(1)
        continue
    c.append('g' if agrees else 'r')
    s.append(8)

fig, ax = plt.subplots(figsize=(4, 4))
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
ax.scatter(x, y, s=s, c=c, edgecolor='none')
ax.set_xlabel("log(fold change) experiment 1")
ax.set_ylabel("log(fold change) experiment 2")

### Looking at individual genes

Now we are going to start looking at individual genes. The first thing we're going to do is sort the genes by the magnitude of the ratio between the WT and Snf2 averages, and then plot the individual values.

In [None]:

# Per-gene averages: within each genotype, and across all samples.
for label, columns in (('ave_wt', exps['wt']),
                       ('ave_mut', exps['mut']),
                       ('ave', exps['wt'] + exps['mut'])):
    df[label] = df[columns].mean(axis=1)

In [None]:
# Wild-type average against mutant average, one square panel.
fig, ax = plt.subplots(figsize=(6, 6))
plotdiff(df, 'ave_wt', 'ave_mut', ax)

In [None]:
# Rank genes by the size of the mutant / wild-type change in average
# expression, then plot every replicate of each ranked gene on its own row.

cols = ['ave_wt','ave_mut']

# only consider genes whose larger average is above 10
tdf = df[cols][df[cols].max(axis=1) > 10].dropna()

# |log2(mut / wt)| for each gene, keyed by the row index
aved = {}
for index, row in tdf.iterrows():
    aved[index] = abs(np.log2(row['ave_mut'] / row['ave_wt']))

# keep changes larger than 2-fold, biggest change first
aved = {k:v for (k,v) in aved.items() if v > 1.0}
aveds = sorted(aved.keys(),key = lambda x: aved[x], reverse=True)

# flatten to parallel lists: expression values and the 1-based rank of their gene
mut_ind = []
mut = []
wt_ind = []
wt = []

for rank, gene in enumerate(aveds, start=1):
    for c in exps['mut']:
        mut_ind.append(rank)
        mut.append(df.loc[gene,c])
    for c in exps['wt']:
        wt_ind.append(rank)
        wt.append(df.loc[gene,c])

# figure height grows with the number of genes; the y axis is inverted so
# rank 1 sits at the top
fig = plt.figure(figsize = (10,.1*max(wt_ind)))
ax = fig.add_subplot(1,1,1)
ax.set_xlim(1,max(wt + mut))
ax.set_ylim(max(wt_ind + mut_ind),0)
# the old `nonposx` keyword is rejected by current Matplotlib;
# `nonpositive` is its replacement
ax.set_xscale("log", nonpositive='clip')
ax.set_xlabel("expression level")
ax.set_ylabel("gene - sorted by mut/wt")
ax.scatter(wt,wt_ind,s=12,alpha=.25,c='r',edgecolor='none')
ax.scatter(mut,mut_ind,s=12,alpha=.25,c='b',edgecolor='none')

In [None]:
# Inspect one gene: histogram the log10 expression of each genotype.

gene = aveds[2]

mut = [np.log10(df.at[gene, col]) for col in exps['mut']]
wt = [np.log10(df.at[gene, col]) for col in exps['wt']]

h = plt.hist(wt, color='blue', label='WT')
h = plt.hist(mut, color='red', label='MUT')
plt.xlabel("log(expression)")
plt.legend()

In [None]:
# Do the raw expression values of the top-ranked gene look normally
# distributed, and are there enough replicates to fit a distribution?

gene = aveds[0]

mut = [df.at[gene, col] for col in exps['mut']]
wt = [df.at[gene, col] for col in exps['wt']]

# norm.fit returns the fitted location and scale (mean and standard deviation)
wt_mu, wt_std = norm.fit(wt)
mut_mu, mut_std = norm.fit(mut)

# one panel per genotype: density-normalized histogram with the fitted curve on top
panels = (
    (1, wt, wt_mu, wt_std, 'red'),
    (2, mut, mut_mu, mut_std, 'blue'),
)
for position, values, mu, std, color in panels:
    ax = plt.subplot(2, 1, position)
    ax.hist(values, density=True, bins=15, alpha=0.6, color=color)
    # evaluate the fitted density across the range the histogram occupies
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)
    ax.plot(x, p, 'k', linewidth=2)
    title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Same question as for the raw values, but on log10 expression: does a
# normal distribution fit, and are there enough replicates to tell?

gene = aveds[0]

mut = [np.log10(df.at[gene, col]) for col in exps['mut']]
wt = [np.log10(df.at[gene, col]) for col in exps['wt']]

# norm.fit returns the fitted location and scale (mean and standard deviation)
wt_mu, wt_std = norm.fit(wt)
mut_mu, mut_std = norm.fit(mut)

# one panel per genotype: density-normalized histogram with the fitted curve on top
panels = (
    (1, wt, wt_mu, wt_std, 'red'),
    (2, mut, mut_mu, mut_std, 'blue'),
)
for position, values, mu, std, color in panels:
    ax = plt.subplot(2, 1, position)
    ax.hist(values, density=True, bins=15, alpha=0.6, color=color)
    # evaluate the fitted density across the range the histogram occupies
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)
    ax.plot(x, p, 'k', linewidth=2)
    title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import ttest_ind

# compare distributions with t-test
# (wt and mut are not defined in this cell; they are the lists left behind by
# whichever earlier cell ran last)

ttest_ind(wt,mut)

In [None]:
# t-test every gene (wild type vs mutant), then plot the replicates of the
# 100 genes with the smallest p-values.

ttestp = {}

for index, row in df.iterrows():
    mut = row[exps['mut']]
    wt = row[exps['wt']]
    ttestp[index] = ttest_ind(wt,mut)[1]   # element [1] of the result is the p-value

# gene names ordered from smallest to largest p-value
ttestps = sorted(ttestp.keys(),key = lambda x: ttestp[x])

# flatten to parallel lists: expression values and the 1-based rank of their gene
mut_ind = []
mut = []
wt_ind = []
wt = []

for rank, gene in enumerate(ttestps[0:100], start=1):
    for c in exps['mut']:
        mut_ind.append(rank)
        mut.append(df.at[gene,c])
    for c in exps['wt']:
        wt_ind.append(rank)
        wt.append(df.at[gene,c])

# the y axis is inverted so rank 1 sits at the top
fig = plt.figure(figsize = (10,.1*max(wt_ind)))
ax = fig.add_subplot(1,1,1)
ax.set_xlim(1,max(wt + mut))
ax.set_ylim(max(wt_ind + mut_ind),0)
# the old `nonposx` keyword is rejected by current Matplotlib;
# `nonpositive` is its replacement
ax.set_xscale("log", nonpositive='clip')
ax.set_xlabel("expression level")
# the ranking here comes from the t-test, so the label says so
ax.set_ylabel("gene - sorted by t-test p-value")
ax.scatter(wt,wt_ind,s=12,alpha=.25,c='r',edgecolor='none')
ax.scatter(mut,mut_ind,s=12,alpha=.25,c='b',edgecolor='none')

It might be better to use a non-parametric test here. The test most commonly used to ask whether two distributions differ is the Kolmogorov-Smirnov test: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test


In [None]:
# Two-sample KS test for every gene (wild type vs mutant), then plot the
# replicates of the 100 genes with the smallest p-values.

ksv = {}

for index, row in df.iterrows():
    mut = row[exps['mut']]
    wt = row[exps['wt']]
    ksv[index] = ks_2samp(wt,mut)[1]   # element [1] of the result is the p-value

# gene names ordered from smallest to largest p-value
ksvs = sorted(ksv.keys(),key = lambda x: ksv[x])

# flatten to parallel lists: expression values and the 1-based rank of their gene
mut_ind = []
mut = []
wt_ind = []
wt = []

for rank, gene in enumerate(ksvs[0:100], start=1):
    for c in exps['mut']:
        mut_ind.append(rank)
        mut.append(df.at[gene,c])
    for c in exps['wt']:
        wt_ind.append(rank)
        wt.append(df.at[gene,c])

# the y axis is inverted so rank 1 sits at the top
fig = plt.figure(figsize = (10,.1*max(wt_ind)))
ax = fig.add_subplot(1,1,1)
ax.set_xlim(1,max(wt + mut))
ax.set_ylim(max(wt_ind + mut_ind),0)
# the old `nonposx` keyword is rejected by current Matplotlib;
# `nonpositive` is its replacement
ax.set_xscale("log", nonpositive='clip')
ax.set_xlabel("expression level")
ax.set_ylabel("gene - sorted by KS")
ax.scatter(wt,wt_ind,s=12,alpha=.25,c='r',edgecolor='none')
ax.scatter(mut,mut_ind,s=12,alpha=.25,c='b',edgecolor='none')

How does the power of these tests vary as we change the sample size?

In [None]:
#
# Synthetic data: two log-normal populations whose log-means are
# np.log(m1) and np.log(m2), with m2 = foldchange * m1
#

foldchange = .5

replicates = 10000
m1 = 1000
m2 = foldchange * m1
v = .3

# lognormal(mean, sigma, size): mean and sigma describe the underlying normal
wt = np.random.lognormal(mean=np.log(m1), sigma=v, size=replicates)
mut = np.random.lognormal(mean=np.log(m2), sigma=v, size=replicates)

# overlay both populations on a log scale
for population in (wt, mut):
    x = plt.hist(np.log(population), bins=50, alpha=0.5, density=True)



In [None]:

#
# How does the t-test p-value respond to the number of experiments we did?
#

trials = 1000
sample_sizes = [2,3,4,5,6,7,8,9,10,15,20]

pvals = {}
for n in sample_sizes:
    scores = []
    for _ in range(trials):
        # draw n values (with replacement) from each synthetic population
        wtr = np.random.choice(wt, n)
        mutr = np.random.choice(mut, n)
        # KS alternative:
        #scores.append(-np.log10(ks_2samp(wtr,mutr)[1]))
        pvalue = ttest_ind(np.log(wtr), np.log(mutr))[1]
        scores.append(-np.log10(pvalue))
    pvals[n] = scores
    

In [None]:
# Distribution of -log10(p) from the simulated tests, one violin per sample size.
fig, ax = plt.subplots(figsize=(6,4))

ax.violinplot(dataset = [pvals[v] for v in sample_sizes])
ax.set_title('')
ax.yaxis.grid(False)
ax.set_xlabel('Sample Size')
# pvals is filled from ttest_ind, so the axis is labelled as a t-test p-value
ax.set_ylabel('-log10(t-test p-value)')
# violins are drawn at positions 1..N; relabel the ticks with the actual sample sizes
a = plt.xticks(range(1,len(sample_sizes)+1),sample_sizes)

In [None]:

#
# How does the spread estimated from a sample depend on the sample size?
#

# NOTE: the name `vars` shadows the Python builtin; it is kept because the
# following cell reads it.
vars = {}

trials = 1000

sample_sizes = [2,3,4,5,6,7,8,9,10,15,20,40]
for n in sample_sizes:
    vars[n] = []
    for t in range(0,trials):
        # draw n values (with replacement) from the synthetic wild-type population
        wtr = np.random.choice(wt,n)
        # np.std: the standard deviation of the logged sample, not its variance
        vars[n].append(np.std(np.log(wtr)))

fig, ax = plt.subplots()

ax.violinplot(dataset = [vars[v] for v in sample_sizes])
ax.set_title('')
ax.yaxis.grid(False)
ax.set_xlabel('Sample Size')
# the plotted quantity is a standard deviation, so the axis is labelled accordingly
ax.set_ylabel('Standard deviation of log(value)')
# violins are drawn at positions 1..N; relabel the ticks with the actual sample sizes
a = plt.xticks(range(1,len(sample_sizes)+1),sample_sizes)

In [None]:
# Overlay the spread estimates for a few sample sizes as outline histograms.
for v in (2, 5, 10, 20):
    plt.hist(vars[v], histtype='step', alpha=.5, label='n=%d' % v)

plt.legend()

In [None]:
# Pull each genotype's sample columns out as a plain 2-D array (one row per gene).
wt, mut = (df[exps[group]].values for group in ('wt', 'mut'))

# one two-sample t-test per row (axis=1 runs along the samples of each gene)
ttest_p = ttest_ind(wt, mut, axis=1).pvalue

In [None]:
# Histogram of -log10(p) across genes, using 50 bin edges spanning 0 to 50.
neg_log_p = -np.log10(ttest_p)
h = plt.hist(neg_log_p, bins=np.linspace(0, 50, 50))

In [None]:
# how many genes are significantly different?

# one test was run per gene, so with a p-value cutoff of .05 we should
# conservatively demand p < .05 / (number of tests).  The count is read from
# the p-value array so it stays right if the gene filter changes.

n_tests = len(ttest_p)
p_cut = .05 / n_tests

len(ttest_p[ttest_p < p_cut])

In [None]:
# lets do some random sampling of different numbers of experiments

n = 2

# pick n distinct replicate columns per genotype; the number available is
# read from the arrays instead of being hard-coded (replace=False needs
# n <= number of columns)
wt_rand_ind = np.random.choice(wt.shape[1],replace=False,size=n)
wt_rand = wt[:,wt_rand_ind]

mut_rand_ind = np.random.choice(mut.shape[1],replace=False,size=n)
mut_rand = mut[:,mut_rand_ind]

# per-gene t-test using only the sampled replicates
ttest_p_rand = ttest_ind(wt_rand,mut_rand,axis=1).pvalue

In [None]:
# Gene-by-gene comparison of -log10(p) from the full data and from the subsample.
plt_x, plt_y = (-np.log10(p) for p in (ttest_p, ttest_p_rand))
fig, ax = plt.subplots(figsize=(5, 5))

# both axes run from 0 to the largest value on either axis
maxval = max(max(plt_x), max(plt_y))
ax.scatter(plt_x, plt_y)
ax.set_xlim(0, maxval)
ax.set_ylim(0, maxval)
ax.set_xlabel("pvalue from full data")
ax.set_ylabel("pvalue from sampled data")

In [None]:
# how many genes are significantly different?

# one test was run per gene, so with a p-value cutoff of .05 we should
# conservatively demand p < .05 / (number of tests).  The count is read from
# the p-value array so it stays right if the gene filter changes.

n_tests = len(ttest_p)
p_cut = .05 / n_tests

print ("Significant in full: ", len(ttest_p[ttest_p < p_cut]))
print ("Significant in sample: ", len(ttest_p_rand[ttest_p_rand < p_cut]))

In [None]:
# For each replicate count n, repeat 100 random subsamples and record how
# many genes pass the significance cutoff (p_cut comes from an earlier cell).

# replicate counts are read from the arrays instead of being hard-coded
n_wt = wt.shape[1]
n_mut = mut.shape[1]

# sampling without replacement needs n <= available replicates, so the
# sweep stops at 39 or at the smaller group size, whichever comes first
max_n = min(39, n_wt, n_mut)

data = []
for n in range(2,max_n + 1):
    for t in range(0,100):
        wt_rand_ind = np.random.choice(n_wt,replace=False,size=n)
        wt_rand = wt[:,wt_rand_ind]

        mut_rand_ind = np.random.choice(n_mut,replace=False,size=n)
        mut_rand = mut[:,mut_rand_ind]

        ttest_p_rand = ttest_ind(wt_rand,mut_rand,axis=1).pvalue
        nsig = len(ttest_p_rand[ttest_p_rand < p_cut])
        data.append([n,nsig])

# one row per simulation: replicate count and number of significant genes
sims = pd.DataFrame(data=data,columns=['n','nsig'])

In [None]:
# Number of significant genes against replicates per group; one dot per simulation.
fig, ax = plt.subplots()
ax.scatter(sims['n'], sims['nsig'], s=1)
t = ax.set_xticks(range(5, 40, 5))

In [None]:
# Same simulation as with the default t-test, but with equal_var=False so
# the two groups are not assumed to share a variance (p_cut comes from an
# earlier cell).

# replicate counts are read from the arrays instead of being hard-coded
n_wt = wt.shape[1]
n_mut = mut.shape[1]

# sampling without replacement needs n <= available replicates, so the
# sweep stops at 39 or at the smaller group size, whichever comes first
max_n = min(39, n_wt, n_mut)

data = []
for n in range(2,max_n + 1):
    for t in range(0,100):
        wt_rand_ind = np.random.choice(n_wt,replace=False,size=n)
        wt_rand = wt[:,wt_rand_ind]

        mut_rand_ind = np.random.choice(n_mut,replace=False,size=n)
        mut_rand = mut[:,mut_rand_ind]

        ttest_p_rand = ttest_ind(wt_rand,mut_rand,axis=1,equal_var=False).pvalue
        nsig = len(ttest_p_rand[ttest_p_rand < p_cut])
        data.append([n,nsig])

# one row per simulation: replicate count and number of significant genes
sims = pd.DataFrame(data=data,columns=['n','nsig'])

In [None]:
# Number of significant genes against replicates per group; one dot per simulation.
fig, ax = plt.subplots()
ax.scatter(sims['n'], sims['nsig'], s=1)
t = ax.set_xticks(range(5, 40, 5))

In [None]:
# lets do some random sampling of different numbers of experiments

n = 3

# pick n distinct wild-type replicate columns; the number available is read
# from the array instead of being hard-coded (replace=False needs
# n <= number of columns)
wt_rand_ind = np.random.choice(wt.shape[1],replace=False,size=n)
wt_rand = wt[:,wt_rand_ind]

# per-gene mean and variance: all replicates vs the subsample
wt_mean = np.mean(wt,axis=1)
wt_mean_rand = np.mean(wt_rand,axis=1)
wt_var = np.var(wt,axis=1)
wt_var_rand = np.var(wt_rand,axis=1)


In [None]:
# Per-gene mean: all wild-type replicates (x) against the subsample (y).
plt_x, plt_y = wt_mean, wt_mean_rand
maxval = max(max(plt_x), max(plt_y))

fig, ax = plt.subplots(figsize=(5, 5))
ax.set_xscale('log')
ax.set_yscale('log')
ax.scatter(plt_x, plt_y, s=1)
ax.set_xlim(1, maxval)
ax.set_ylim(1, maxval)
ax.set_xlabel("mean full data")
ax.set_ylabel("mean in sampled data")

# red y = x line for reference
diagonal = np.linspace(1, maxval, 10)
ax.plot(diagonal, diagonal, color='red')

In [None]:
# Per-gene variance: all wild-type replicates (x) against the subsample (y).
plt_x, plt_y = wt_var, wt_var_rand
maxval = max(max(plt_x), max(plt_y))

fig, ax = plt.subplots(figsize=(5, 5))
ax.set_xscale('log')
ax.set_yscale('log')
ax.scatter(plt_x, plt_y, s=1)
ax.set_xlim(1, maxval)
ax.set_ylim(1, maxval)
ax.set_xlabel("variance full data")
ax.set_ylabel("variance in sampled data")

# red y = x line for reference
diagonal = np.linspace(1, maxval, 10)
ax.plot(diagonal, diagonal, color='red')

In [None]:
# Mean against variance per gene, both from all wild-type replicates.
plt_x, plt_y = wt_mean, wt_var

fig, ax = plt.subplots(figsize=(5, 5))
for set_scale in (ax.set_xscale, ax.set_yscale):
    set_scale('log')
ax.set_xlim(1, max(plt_x))
ax.set_ylim(1, max(plt_y))
ax.scatter(plt_x, plt_y, s=1)
ax.set_xlabel("mean")
ax.set_ylabel("variance")


In [None]:
# Mean against variance per gene, both from the subsampled replicates.
plt_x, plt_y = wt_mean_rand, wt_var_rand

fig, ax = plt.subplots(figsize=(5, 5))
for set_scale in (ax.set_xscale, ax.set_yscale):
    set_scale('log')
ax.set_xlim(1, max(plt_x))
ax.set_ylim(1, max(plt_y))
ax.scatter(plt_x, plt_y, s=1)
ax.set_xlabel("mean")
ax.set_ylabel("variance")


In [None]:
# Subsample mean (x) against the variance from all replicates (y).
plt_x, plt_y = wt_mean_rand, wt_var

fig, ax = plt.subplots(figsize=(5, 5))
for set_scale in (ax.set_xscale, ax.set_yscale):
    set_scale('log')
ax.set_xlim(1, max(plt_x))
ax.set_ylim(1, max(plt_y))
ax.scatter(plt_x, plt_y, s=1)
ax.set_xlabel("mean in rand")
ax.set_ylabel("variance")


In [None]:
def runave(x,y,n):
    """Running average of y taken along increasing x.

    Parameters
    ----------
    x, y : 1-D arrays of equal length
    n    : window size in points (positive integer)

    Returns
    -------
    (xsrt, yave) : x sorted ascending, and for each sorted position i the
        mean of the sorted y values at positions i - n//2 .. i + (n-1)//2.
        Near either end the window is truncated, and the mean is taken over
        the points that actually fall inside it, so the ends are not pulled
        toward zero.  yave always has the same length as x, including when n
        is larger than the input.

    Raises
    ------
    ValueError if n is smaller than 1.
    """
    if n < 1:
        raise ValueError("window size n must be at least 1")

    x = np.asarray(x)
    y = np.asarray(y)

    sinds = x.argsort()
    xsrt = x[sinds]
    ysrt = y[sinds]

    npts = len(ysrt)
    if npts == 0:
        # nothing to average; np.convolve rejects empty input
        return (xsrt, ysrt.astype(float))

    window = np.ones((n,))
    # slicing the full convolution at this offset reproduces the centring of
    # mode='same' while keeping the output exactly npts long
    start = (n - 1) // 2
    stop = start + npts

    sums = np.convolve(ysrt, window, mode='full')[start:stop]
    # number of real points under the window at each position
    counts = np.convolve(np.ones(npts), window, mode='full')[start:stop]

    return (xsrt, sums / counts)

In [None]:
# Running average (100-point window) of variance along mean, all wild-type replicates.
x, y = wt_mean, wt_var
xs, r = runave(x, y, 100)

In [None]:
# Mean-variance scatter with its running average overlaid in red.
# x, y, xs and r are whatever the preceding cell set.
plt_x = x
plt_y = y
fig,ax = plt.subplots(1,1,figsize=(5,5))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(1,max(plt_x))
ax.set_ylim(1,max(plt_y))
ax.scatter(plt_x,plt_y,s=1)
# the preceding cell sets x = wt_mean (all replicates), so this is not the
# subsampled mean
ax.set_xlabel("mean")
ax.set_ylabel("variance")

# running average on top of the raw points
plt_x = xs
plt_y = r
ax.scatter(plt_x,plt_y,s=1,c='red')



In [None]:
# Running average (100-point window) of variance along mean, subsampled replicates.
x, y = wt_mean_rand, wt_var_rand
xs, r = runave(x, y, 100)

In [None]:
# Mean-variance scatter with its running average overlaid in red.
# x, y, xs and r are whatever the preceding cell set.
fig, ax = plt.subplots(figsize=(5, 5))
for set_scale in (ax.set_xscale, ax.set_yscale):
    set_scale('log')

# raw points; axis limits come from their maxima
plt_x, plt_y = x, y
ax.set_xlim(1, max(plt_x))
ax.set_ylim(1, max(plt_y))
ax.scatter(plt_x, plt_y, s=1)
ax.set_xlabel("mean in rand")
ax.set_ylabel("variance")

# running average on top of the raw points
plt_x, plt_y = xs, r
ax.scatter(plt_x, plt_y, s=1, c='red')

