In [None]:
%matplotlib inline

In [None]:
# import libraries 

import pandas as pd

# this is here to shut off some annoying warnings from pandas
pd.options.mode.chained_assignment = None

# matplotlib is one of the main plotting libraries we're going to use
import matplotlib 
import matplotlib.pyplot as plt
%matplotlib inline

# the other plotting library is seaborn - we'll use both during the class

# import seaborn as sns

# numpy and scipy are for handling numerical and scientific data

import numpy as np
import scipy as sp

#import statsmodels.formula.api as smf # basic statistical modeling

# pearsonr is imported from the public scipy.stats namespace;
# scipy.stats.stats is a deprecated private module path
from scipy.stats import pearsonr

import os

from scipy import stats
from scipy.stats import ks_2samp
from scipy.stats import entropy
from scipy.stats import norm
from scipy.stats import lognorm
from scipy.stats import nbinom
from scipy.stats.mstats import gmean

#from pandas.tools.plotting import scatter_matrix



I wanted to give you a little primer on making plots and figures so you can follow the code I've written and start to write your own. So this notebook will walk through the construction of plots for showing expression differences between two samples. 

In [None]:
# Prepare the data: read the expression table into a dataframe.

datafile = "data/barton/Barton_combined_Ygenes.txt"

# sep='\t' tells pandas the file is tab separated;
# the 'Gene' column is then turned into the row index
df = pd.read_csv(datafile, sep='\t').set_index('Gene')

# show the first few rows as the cell output
df.head()

In [None]:
# Group the sample columns into named lists, picked out by column-name prefix.

exps = {
    'wt': [col for col in df.columns if col.startswith('WT')],
    'mut': [col for col in df.columns if col.startswith('Snf2')],
}

# "adding" two python lists builds a new list holding the elements of both
exps['all'] = exps['wt'] + exps['mut']

In [None]:
# Light cleanup: keep only the genes whose values, summed over every sample
# column, come to more than 0 - this is how the all-0 genes get dropped.

df = df.loc[df[exps['all']].sum(axis=1).gt(0)]

In [None]:
# The reference value for each gene is its mean over all of the sample columns.

df['ref'] = df.loc[:, exps['all']].mean(axis=1)

In [None]:
# Rescale every sample column so that the median of its ratio to the reference is 1.
# df['ref'] is not touched inside the loop, so each column is scaled against
# the same reference values.

for c in exps['all']:
    # called size_factor rather than norm so that the `norm` imported from
    # scipy.stats at the top of the notebook is not overwritten
    size_factor = np.median(df[c] / df['ref'])
    df[c] = df[c] / size_factor

# recalculate the reference from the rescaled columns

df['ref'] = df[exps['all']].mean(axis=1)

OK. Now that we've done that, let's construct a plot. I'm going to do this entirely with matplotlib, and walk you through the different steps that go into making a full plot.

In [None]:
# Step 1: decide which data to plot.
# Here that is one wildtype column and one mutant column.

x, y = df['WT_rep01'].values, df['Snf2_rep01'].values

# x and y are both numpy arrays

In [None]:
# The quickest route: hand the data straight to matplotlib's scatter function.

plt.scatter(x=x, y=y)

In [None]:
# From here on I'll use a slightly different matplotlib feature instead,
# because it gives more options for customizing and building figures:
# subplots, which are created like this

fig, ax = plt.subplots(nrows=1, ncols=1)

In [None]:
# The previous command made one empty subplot and displayed it.
# The nice thing about subplots is that a figure can hold several of them,
# which we'll make use of later.
# For example, this asks for 2 rows and 3 columns of plots
# (how to draw on them comes later).

fig, ax = plt.subplots(nrows=2, ncols=3)

In [None]:
# For now a single subplot is all we need,
# and this time we also say how big it should be.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

In [None]:
# when we call the subplot function, it not only 
# creates this plot, it gives us two variables - fig, ax - to access the plot
# we'll mostly use ax here

In [None]:
# Start with the bare scatter plot, drawn on the subplot through ax.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y)

In [None]:
# As we saw before, these data should be plotted on log-transformed axes.
# set_xscale and set_yscale switch each axis over to a log scale.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y)

ax.set_xscale('log')
ax.set_yscale('log')

In [None]:
# matplotlib tends to draw these log plots badly unless it is told where
# each axis should start and stop, which we specify with the
# set_xlim and set_ylim commands.
# To begin with the limits are simply hardcoded.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y)

ax.set_xscale('log')
ax.set_yscale('log')

# i.e. "only show the part of each axis that lies between 1 and 10000"
ax.set_xlim(left=1, right=10000)
ax.set_ylim(bottom=1, top=10000)

In [None]:
# Those limits were chosen arbitrarily, so let's compute them from the data instead.
# A log axis cannot show values that are 0 or negative (their log is not defined),
# so 1 is kept as a convenient lower limit - values below 1 fall off the plot -
# and only the upper limit has to be computed.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y)

ax.set_xscale('log')
ax.set_yscale('log')

# the largest value found in either x or y
ul = max(max(values) for values in (x, y))

# i.e. "only show the part of each axis that lies between 1 and ul"
ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

In [None]:
# It is better to go a little past the last point so that
# all of the data is clearly visible on the plot.
# So ul is set to 2 * max, which on a log plot is only a small buffer.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y)

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

In [None]:
# There are a lot of points here, so shrink them a bit
# with the scatter plot's size option - given here as s=1.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y, s=1)

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

In [None]:
# Before going any further, label both axes
# and give the plot a title.

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y, s=1)

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

ax.set_xlabel("expression in WT_rep01")
ax.set_ylabel("expression in Snf2_rep01")
ax.set_title("Comparison of WT_rep01 and Snf2_rep01")

In [None]:
# To make it easy to change which experiments are compared,
# keep the two column names in variables.

c1, c2 = "WT_rep01", "Snf2_rep01"
x, y = df[c1].values, df[c2].values

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.scatter(x=x, y=y, s=1)

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

# the axis labels and title are built from the same variables

ax.set_xlabel("expression in " + c1)
ax.set_ylabel("expression in " + c2)
ax.set_title("Comparison of " + c1 + " " + c2)

In [None]:
# That is the basic outline of a plot.
# Next we want to highlight the values that are up/down 2x, which takes two things:
# 1) selecting the subset of points that satisfy each criterion
# 2) plotting each subset separately, in its own color

# the setup is all the same as before

c1, c2 = "WT_rep01", "Snf2_rep01"
x, y = df[c1].values, df[c2].values

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

ax.set_xlabel("expression in " + c1)
ax.set_ylabel("expression in " + c2)
ax.set_title("Comparison of " + c1 + " " + c2)

# ...except that the actual plotting was held back until here

# overexpressed values come first:
# keep the rows of df where c2 > 2 * c1
oedf = df.loc[df[c2] > 2 * df[c1]]
x, y = oedf[c1].values, oedf[c2].values

ax.scatter(x=x, y=y, s=1, c='red')



In [None]:
c1, c2 = "WT_rep01", "Snf2_rep01"
x, y = df[c1].values, df[c2].values

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

ax.set_xlabel("expression in " + c1)
ax.set_ylabel("expression in " + c2)
ax.set_title("Comparison of " + c1 + " " + c2)

# overexpressed rows (c2 > 2 * c1) in red
oedf = df.loc[df[c2] > 2 * df[c1]]
x, y = oedf[c1].values, oedf[c2].values

ax.scatter(x=x, y=y, s=1, c='red')

# new in this cell: the underexpressed rows (c1 > 2 * c2)

uedf = df.loc[df[c1] > 2 * df[c2]]
x, y = uedf[c1].values, uedf[c2].values

# the important feature to notice: several plots can be
# added to the same figure, one scatter call after another

ax.scatter(x=x, y=y, s=1, c='green')


In [None]:
c1, c2 = "WT_rep01", "Snf2_rep01"
x, y = df[c1].values, df[c2].values

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

ax.set_xlabel("expression in " + c1)
ax.set_ylabel("expression in " + c2)
ax.set_title("Comparison of " + c1 + " " + c2)

# overexpressed rows (c2 > 2 * c1) in red
oedf = df.loc[df[c2] > 2 * df[c1]]
x, y = oedf[c1].values, oedf[c2].values

ax.scatter(x=x, y=y, s=1, c='red')

# underexpressed rows (c1 > 2 * c2) in green
uedf = df.loc[df[c1] > 2 * df[c2]]
x, y = uedf[c1].values, uedf[c2].values

ax.scatter(x=x, y=y, s=1, c='green')

# new in this cell: the unchanged points.
# between() is a handy pandas function that picks out the values
# lying between two other values (the end points count as "between")

ucdf = df.loc[df[c1].between(left=df[c2] * .5, right=df[c2] * 2.0)]
x, y = ucdf[c1].values, ucdf[c2].values

ax.scatter(x=x, y=y, s=1, c='black')



In [None]:
c1, c2 = "WT_rep01", "Snf2_rep01"
x, y = df[c1].values, df[c2].values

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

ax.set_xlabel("expression in " + c1)
ax.set_ylabel("expression in " + c2)
ax.set_title("Comparison of " + c1 + " " + c2)

# overexpressed rows (c2 > 2 * c1) in red
oedf = df.loc[df[c2] > 2 * df[c1]]
x, y = oedf[c1].values, oedf[c2].values

ax.scatter(x=x, y=y, s=1, c='red')

# underexpressed rows (c1 > 2 * c2) in green
uedf = df.loc[df[c1] > 2 * df[c2]]
x, y = uedf[c1].values, uedf[c2].values

ax.scatter(x=x, y=y, s=1, c='green')

# unchanged rows (c1 within a factor of 2 of c2) in black
ucdf = df.loc[df[c1].between(left=df[c2] * .5, right=df[c2] * 2.0)]
x, y = ucdf[c1].values, ucdf[c2].values

# one last little tweak:
# the black spots should not dominate the picture,
# so they are dimmed with an alpha value below 1,
# which makes them partly transparent

ax.scatter(x=x, y=y, s=1, c='black', alpha=.1)



In [None]:
c1, c2 = "WT_rep01", "Snf2_rep01"
x, y = df[c1].values, df[c2].values

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

ax.set_xscale('log')
ax.set_yscale('log')

ul = 2 * max(max(values) for values in (x, y))

ax.set_xlim(left=1, right=ul)
ax.set_ylim(bottom=1, top=ul)

ax.set_xlabel("expression in " + c1)
ax.set_ylabel("expression in " + c2)
ax.set_title("Comparison of " + c1 + " " + c2)

oedf = df.loc[df[c2] > 2 * df[c1]]
x, y = oedf[c1].values, oedf[c2].values

# the finishing touch is a legend saying what the red and the green
# points are, along with how many of each there are.
# The label text is built with a string formatting command
# (%d is replaced by the number of rows), which I can explain later.

ax.scatter(x=x, y=y, s=1, c='red', label='2x up, n = %d' % len(oedf))

uedf = df.loc[df[c1] > 2 * df[c2]]
x, y = uedf[c1].values, uedf[c2].values

ax.scatter(x=x, y=y, s=1, c='green', label='2x down, n = %d' % len(uedf))

ucdf = df.loc[df[c1].between(left=df[c2] * .5, right=df[c2] * 2.0)]
x, y = ucdf[c1].values, ucdf[c2].values

# the unchanged points get no label, so they do not appear in the legend
ax.scatter(x=x, y=y, s=1, c='black', alpha=.1)
ax.legend()


In [None]:
# now let's say we wanted to make a figure with this kind of plot for
# several different comparisons
# I could just hard code each subpanel of a figure
# but that would be a pain - so instead i'm going to create a function
# the arguments to the function are going to be 
# 1) a dataframe
# 2) the columns to plot
# 3) the "ax" object for the subpanel we want to plot it on
# you'll see why this is useful in a second
# first here's the function

# this line says we're making a function
# and specifies what variables we want to pass to it

def plotdiff(df,c1,c2,ax,fold=2.0):
    """Draw a log-log scatter plot comparing two columns of df on ax.

    The rows of df are split into three groups that are plotted separately:
    red where c2 is more than `fold` times c1, green where c1 is more than
    `fold` times c2, and dimmed black for everything in between (the end
    points of that range are included). The red and green groups are named
    in the legend together with their row counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the values to plot.
    c1, c2 : str
        Names of the columns shown on the x axis (c1) and the y axis (c2).
        They are also used to build the axis labels and the title.
    ax : matplotlib Axes
        The subplot to draw on. The figure itself is created by the caller.
    fold : float, optional
        Fold-change cutoff for calling a row up or down. The default of 2.0
        gives the same 2x threshold that was used in the cells above.

    Returns
    -------
    None
        The function only draws on ax.

    Raises
    ------
    ValueError
        If fold is less than 1, or if df has no rows (the upper axis limit
        cannot be computed from empty columns).
    """

    # with a cutoff below 1 the "up" and "down" groups would overlap
    if fold < 1:
        raise ValueError("fold must be at least 1")

    # the rest is just what we did before
    # except I don't create the figure

    x = df[c1].values
    y = df[c2].values

    ax.set_xscale('log')
    ax.set_yscale('log')

    # go to 2 * the largest value so the top points do not sit on the edge.
    # The arrays' own .max() is used here; python's max() would loop over
    # the values one at a time.
    ul = max(x.max(), y.max()) * 2

    # both axes start at 1, so rows with a value below 1 are not visible
    # (they are still counted in the legend)
    ax.set_xlim(1,ul)
    ax.set_ylim(1,ul)

    ax.set_xlabel("expression in " + c1)
    ax.set_ylabel("expression in " + c2)
    ax.set_title("Comparison of " + c1 + " " + c2)

    # rows where c2 is more than `fold` times c1
    oedf = df[df[c2] > fold * df[c1]]

    # %g prints the cutoff without trailing zeros, so 2.0 shows up as "2"
    ax.scatter(oedf[c1].values, oedf[c2].values, s=1, c='red',
               label='%gx up, n = %d' % (fold, len(oedf)))

    # rows where c1 is more than `fold` times c2
    uedf = df[df[c1] > fold * df[c2]]

    ax.scatter(uedf[c1].values, uedf[c2].values, s=1, c='green',
               label='%gx down, n = %d' % (fold, len(uedf)))

    # everything else: c1 within a factor of `fold` of c2
    ucdf = df[df[c1].between(df[c2] / fold, df[c2] * fold)]

    # one last little tweak
    # i don't want the black spots to dominate visually
    # so i make them somewhat dimmer using an
    # alpha value of < 1 which makes them somewhat transparent

    ax.scatter(ucdf[c1].values, ucdf[c2].values, s=1, c='black', alpha=.1)
    ax.legend()
    
    
    

In [None]:
# the figure is made here...

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

# ...and its subplot is handed to our function, which draws the plot onto it

plotdiff(df=df, c1='WT_rep01', c2='WT_rep02', ax=ax)

# cool, huh?

In [None]:
# A figure holding multiple such plots is now easy to make.
# Let's lay out a 2x3 grid.

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# with a 1x1 grid ax was just a single variable,
# but with a grid it is an array, indexed by row and column
# so to begin with we plot in the upper left subplot

plotdiff(df, 'WT_rep01', 'Snf2_rep01', ax[0, 0])

In [None]:

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

plotdiff(df, 'WT_rep01', 'Snf2_rep01', ax[0, 0])

# and now the rest of the grid, one panel per mutant replicate

plotdiff(df, 'WT_rep01', 'Snf2_rep02', ax[0, 1])
plotdiff(df, 'WT_rep01', 'Snf2_rep03', ax[0, 2])
plotdiff(df, 'WT_rep01', 'Snf2_rep04', ax[1, 0])
plotdiff(df, 'WT_rep01', 'Snf2_rep05', ax[1, 1])
plotdiff(df, 'WT_rep01', 'Snf2_rep06', ax[1, 2])


In [None]:

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# ax.flat walks the grid row by row: (0,0), (0,1), (0,2), (1,0), (1,1), (1,2)
mutant_reps = ['Snf2_rep01', 'Snf2_rep02', 'Snf2_rep03',
               'Snf2_rep04', 'Snf2_rep05', 'Snf2_rep06']

for mutant_rep, panel in zip(mutant_reps, ax.flat):
    plotdiff(df, 'WT_rep01', mutant_rep, panel)

# adding this command cleans up overlapping axes
plt.tight_layout()

In [None]:
# and finally - just for fun - let's make one big figure comparing each WT replicate
# to the Snf2 replicate with the same number (WT_rep01 vs Snf2_rep01, and so on)
# NOTE: this is a replicate-vs-replicate comparison; the 'ref' column is not used here
# be patient - this is slow

# the grid size is named once so the loop below always matches the figure
n_rows, n_cols = 12, 4

fig, ax = plt.subplots(n_rows, n_cols, figsize=(16,48))

for i in range(n_rows * n_cols):
    # we need to figure out where on the grid each subplot should go
    # divmod gives the whole-number quotient and the remainder in one step:
    # the quotient is the row - this puts 0-3 in row 0, 4-7 in row 1, etc..
    # the remainder is the col - this puts 0,4,8,12 etc in col 0, 1,5,9,13 in col 1 and so on
    row, col = divmod(i, n_cols)

    plotdiff(df,'WT_rep%02d' % (i+1),'Snf2_rep%02d' % (i+1), ax[row][col])

plt.tight_layout()