In [1]:
import os
import datetime
import numpy as np
import xarray as xa
import pprint

import logging
import warnings

logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)

## Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# Suppress DeprecationWarning messages raised from tensorflow modules; this only reduces
# output noise and is not part of the analysis.
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

# Introduction

Perfect confounding occurs frequently in differential expression assays, often because biological replicates cannot be spread across conditions: this is typically the case with animals or patients. Perfect confounding implies that the corresponding design matrix is not full rank and the model is underdetermined. This can be circumvented by certain tricks which essentially regress replicates to reference replicates. We believe that this is firstly undesirable, as the condition coefficients depend on the identity of the reference replicates and accordingly on the ordering of the replicates, which has no experimental meaning and is purely a result of sample labels. Secondly, such tricks may be hard to come up with in hard cases such as the one presented in example 2. Here, we show how one can solve both problems by constraining parameters in the model.

# Example 1: easy

## Simulate data

In this example, we have 4 biological replicates (animals, patients, cell culture replicates etc.) in a treatment experiment: 2 in each condition (treated, untreated). Accordingly, there is perfect confounding at this level. We circumvent this by constraining the biological replicate coefficients to not model mean trends. 

### Define design matrices

In [4]:
# Design matrix for 2000 cells: intercept, four biological replicates, condition.
ncells = 2000
dmat = np.zeros([ncells, 6])
dmat[:, 0] = 1  # intercept
# Columns 1-4: one indicator per biological replicate, 500 consecutive cells each.
dmat[:, 1:5] = np.repeat(np.eye(4), 500, axis=0)
dmat[1000:, 5] = 1  # condition effect: cells of biological replicates 3 and 4
print(np.unique(dmat, axis=0))

[[1. 0. 0. 0. 1. 1.]
 [1. 0. 0. 1. 0. 1.]
 [1. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]]


In [5]:
sim = glm.models.nb_glm.Simulator(num_features=100)

In [6]:
# Use the same design matrix for both the location and the scale model.
sim.parse_dmat_loc(dmat = dmat)
sim.parse_dmat_scale(dmat = dmat)
# Generate the model parameters first, then the data from them.
# NOTE(review): no random seed is set in any cell above, so re-running presumably
# yields different parameters and counts than the recorded outputs — confirm which
# RNG the Simulator uses and seed it if reproducibility matters.
sim.generate_params()
sim.generate_data()

### Simulated model data:

In [7]:
sim.X

<xarray.DataArray 'X' (observations: 2000, features: 100)>
array([[ 5554,  3696, 15028, ...,  3384,  7828,  8991],
       [ 9300,  3033,  8415, ...,  1973,  7096,  7946],
       [ 7489,  3839,  9305, ...,  3096,  7724, 11018],
       ...,
       [ 6995,  3457, 16165, ..., 15397,  7509,  3128],
       [25949,  4912, 19161, ...,  7005, 10716,  2587],
       [19859,  6244, 18609, ..., 13881,  5867,  4090]])
Dimensions without coordinates: observations, features

In [8]:
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 1., 1.],
       [1., 0., 0., 1., 0., 1.],
       [1., 0., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.]])

### The parameters used to generate this data:

In [9]:
sim.par_link_loc

<xarray.DataArray 'a' (design_loc_params: 6, features: 100)>
array([[ 8.985995,  8.013137,  8.946747, ...,  8.819069,  8.601582,  8.457401],
       [ 0.140413,  0.189254,  0.233925, ..., -0.579045,  0.09183 ,  0.620308],
       [ 0.399589,  0.123379,  0.134321, ...,  0.066732,  0.568198,  0.242539],
       [ 0.508742,  0.07867 ,  0.489318, ...,  0.236383,  0.509544,  0.073027],
       [ 0.545695,  0.49701 ,  0.296227, ...,  0.648868,  0.597661,  0.280183],
       [ 0.2958  ,  0.046311,  0.549773, ...,  0.17884 , -0.393654, -0.607054]])
Coordinates:
  * design_loc_params  (design_loc_params) <U2 'p0' 'p1' 'p2' 'p3' 'p4' 'p5'
Dimensions without coordinates: features

In [10]:
sim.par_link_scale

<xarray.DataArray 'b' (design_scale_params: 6, features: 100)>
array([[ 1.609438,  2.302585,  1.94591 , ...,  1.791759,  1.098612,  2.302585],
       [ 0.630657,  0.574183, -0.09797 , ..., -0.657355, -0.114174,  0.088411],
       [ 0.288293, -0.426868,  0.334359, ...,  0.576601, -0.035779, -0.091614],
       [ 0.559647, -0.151628,  0.013794, ..., -0.179191,  0.415133,  0.499975],
       [ 0.211942,  0.47216 ,  0.433263, ...,  0.019253,  0.430978,  0.361936],
       [ 0.257496,  0.215783,  0.150664, ..., -0.102467,  0.384797,  0.584132]])
Coordinates:
  * design_scale_params  (design_scale_params) <U2 'p0' 'p1' 'p2' 'p3' 'p4' 'p5'
Dimensions without coordinates: features

## Constraints for model

In [11]:
dmat_est_loc = sim.design_loc

In [12]:
dmat_est_scale = sim.design_scale

Build constraints based on sets of parameters that have to sum to zero. Each of these constraints is enforced by binding one of these parameters to the rest of the set. Such a constraint is encoded by assigning a -1 to the dependent parameter and a 1 to each of the other parameters in the set.

In [13]:
# One row per sum-to-zero constraint: -1 marks the dependent parameter,
# 1 marks the remaining members of the constrained set.
constraints_loc = np.zeros([2, dmat_est_loc.shape[1]])
# Constraint 0: Account for perfect confounding at biological replicate and treatment level
# by constraining biological replicate coefficients not to produce mean effects across conditions.
constraints_loc[0, [3, 4]] = [-1, 1]
# Constraint 1: Account for fact that first level of biological replicates was not absorbed into offset.
constraints_loc[1, 1:5] = [-1, 1, 1, 1]
constraints_loc

array([[ 0.,  0.,  0., -1.,  1.,  0.],
       [ 0., -1.,  1.,  1.,  1.,  0.]])

In [14]:
constraints_scale = constraints_loc.copy()

In [15]:
from numpy.linalg import matrix_rank

# Rewrite the +1/-1 encoding as plain sum-to-zero rows: every member of a set gets a 1.
constraints_loc_mod = np.where(constraints_loc == -1, 1, constraints_loc)
# Distinct design rows on their own and stacked on top of the constraint rows.
design_rows = np.unique(dmat_est_loc, axis=0)
design_and_constraints = np.vstack([design_rows, constraints_loc_mod])
n_params = dmat_est_loc.shape[1]
print(design_and_constraints)
print("rank deficiency without constraints: "+ str(n_params - matrix_rank(design_rows)))
print("rank deficiency with constraints: "+ str(n_params - matrix_rank(design_and_constraints)))

[[1. 0. 0. 0. 1. 1.]
 [1. 0. 0. 1. 0. 1.]
 [1. 0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0.]
 [0. 1. 1. 1. 1. 0.]]
rank deficiency without constraints: 2
rank deficiency with constraints: 0


## Estimate the model

In [19]:
?glm.models.nb_glm.InputData.new

In [18]:
X = sim.X
design_loc = dmat_est_loc
design_scale = dmat_est_scale

# input data
# Pass the simulated counts, both design matrices and both constraint matrices to InputData.new.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: new() got an unexpected keyword argument 'constraints_loc'", so the batchglm
# version used for that run presumably does not accept constraints here — confirm the
# signature of InputData.new in the installed version; all following cells depend on input_data.
input_data = glm.models.nb_glm.InputData.new(
    data=X, 
    design_loc=design_loc,
    design_scale=design_scale,
    constraints_loc=constraints_loc,
    constraints_scale=constraints_scale)

TypeError: new() got an unexpected keyword argument 'constraints_loc'

### Set up estimator:

In [None]:
estimator = glm.models.nb_glm.Estimator(input_data, quick_scale=False)
estimator.initialize()

### Train

Now start the training sequence and let the estimator choose automatically the best training strategy:

In [None]:
estimator.train_sequence('QUICK')

## Obtaining the results

The fitted parameters can be retrieved by calling the corresponding parameters of `estimator`:

In [None]:
estimator.par_link_loc

In [None]:
estimator.par_link_scale

### Check that constraints were met

These parameter sets should sum to zero for each gene.

In [None]:
# Constraint 1: the coefficients of biological replicates 1-4 (rows 1..4) must sum to zero per gene.
# Report the largest absolute deviation: a plain maximum of the signed sums would
# show ~0 even if some genes violated the constraint in the negative direction.
np.max(np.abs(np.sum(estimator.par_link_loc[1:5,:], axis=0)))

In [None]:
# Constraint 0: the coefficients of biological replicates 3 and 4 (rows 3..4) must sum to zero per gene.
# Summing rows 1..4 here would merely repeat the check of constraint 1, so only the
# constrained pair is summed; the absolute value keeps negative violations visible.
np.max(np.abs(np.sum(estimator.par_link_loc[3:5,:], axis=0)))

## Comparing the results with the simulated data:

Linear model output:

In [None]:
# Location model: fitted versus simulated linear predictor (design matrix times coefficients).
loc_fitted = np.matmul(estimator.design_loc, estimator.par_link_loc)
loc_simulated = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(loc_fitted, loc_simulated)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: same comparison.
scale_fitted = np.matmul(estimator.design_scale, estimator.par_link_scale)
scale_simulated = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(scale_fitted, scale_simulated)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

# Example 2: advanced

## Simulate some data

In this example, we have 4 biological replicates (animals, patients, cell culture replicates etc.) in a treatment experiment: 2 in each condition (treated, untreated). Accordingly, there is perfect confounding at this level already. We circumvent this by constraining the biological replicate coefficients to not model mean trends (constraints 0,1). Secondly, there are technical replicates which contain cells from one biological replicate from each condition. Each biological replicate was assigned to one treated-untreated sample pair and each pair split into two technical replicates. Again, we correct perfect confounding by constraining the technical replicate coefficients not to model mean effects (constraints 2,3).

### Define design matrices

In [None]:
# Design matrix for 2000 cells: intercept, four biological replicates,
# four technical replicates, condition.
ncells = 2000
dmat = np.zeros([ncells, 10])
dmat[:, 0] = 1  # intercept
# Columns 1-4: biological replicates 1-4, 500 consecutive cells each.
dmat[:, 1:5] = np.repeat(np.eye(4), 500, axis=0)
# Columns 5-8: technical replicates 1-4, 250 consecutive cells each; the pattern is
# repeated for the second 1000 cells, so each technical replicate covers both conditions.
dmat[:, 5:9] = np.tile(np.repeat(np.eye(4), 250, axis=0), (2, 1))
dmat[1000:, 9] = 1  # condition effect
print(np.unique(dmat, axis=0))

In [None]:
sim = glm.models.nb_glm.Simulator(num_features=100)

In [None]:
# Location and scale models share the design matrix defined for this example.
sim.parse_dmat_loc(dmat = dmat)
sim.parse_dmat_scale(dmat = dmat)
# Parameters are generated before the data.
# NOTE(review): no random seed is set, so results presumably differ between runs — confirm.
sim.generate_params()
sim.generate_data()

### Simulated model data:

In [None]:
sim.X

## Constraints for model

In [None]:
dmat_est_loc = sim.design_loc

In [None]:
dmat_est_scale = sim.design_scale

Build constraints based on sets of parameters that have to sum to zero. Each of these constraints is enforced by binding one of these parameters to the rest of the set. Such a constraint is encoded by assigning a -1 to the dependent parameter and a 1 to each of the other parameters in the set.

In [None]:
np.unique(dmat_est_loc, axis=0)

In [None]:
# One row per sum-to-zero constraint: -1 marks the dependent parameter,
# 1 marks the remaining members of the constrained set.
constraints_loc = np.zeros([4, dmat_est_loc.shape[1]])
# Constraint 0: Account for perfect confounding at biological replicate and treatment level
# by constraining biological replicate coefficients not to produce mean effects across conditions.
constraints_loc[0, [3, 4]] = [-1, 1]
# Constraint 1: Account for fact that first level of biological replicates was not absorbed into offset.
constraints_loc[1, 1:5] = [-1, 1, 1, 1]
# Constraint 2: Account for perfect confounding at biological replicate and technical replicate
# by constraining technical replicate coefficients not to produce mean effects across biological replicates.
constraints_loc[2, [7, 8]] = [-1, 1]
# Constraint 3: Account for fact that first level of technical replicates was not absorbed into offset.
constraints_loc[3, 5:9] = [-1, 1, 1, 1]

constraints_loc

In [None]:
constraints_scale = constraints_loc.copy()

In [None]:
from numpy.linalg import matrix_rank

# Rewrite the +1/-1 encoding as plain sum-to-zero rows: every member of a set gets a 1.
constraints_loc_mod = np.where(constraints_loc == -1, 1, constraints_loc)
# Distinct design rows on their own and stacked on top of the constraint rows.
design_rows = np.unique(dmat_est_loc, axis=0)
design_and_constraints = np.vstack([design_rows, constraints_loc_mod])
n_params = dmat_est_loc.shape[1]
print(design_and_constraints)
print("rank deficiency without constraints: "+ str(n_params - matrix_rank(design_rows)))
print("rank deficiency with constraints: "+ str(n_params - matrix_rank(design_and_constraints)))

## Estimate the model

In [None]:
# Inputs for the estimator: simulated counts plus the design matrices of both models.
X, design_loc, design_scale = sim.X, dmat_est_loc, dmat_est_scale

# Bundle data, designs and constraints into one input object.
# NOTE(review): the recorded run of the identical call in Example 1 raised a TypeError on
# 'constraints_loc' — confirm that the installed batchglm version accepts these keywords.
input_data = glm.models.nb_glm.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
    constraints_loc=constraints_loc,
    constraints_scale=constraints_scale,
)

### Set up estimator:

Note that there is no closed form estimator for the mean model here due to the confounding. The model is initialised with least squares but the mean model is also trained.

In [None]:
estimator = glm.models.nb_glm.Estimator(input_data, quick_scale=False)
estimator.initialize()

### Train

Now start the training sequence and let the estimator choose automatically the best training strategy:

In [None]:
estimator.train_sequence('QUICK')

## Obtaining the results

### Check that constraints were met

These parameter sets should sum to zero for each gene.

In [None]:
# Constraint 1: the coefficients of biological replicates 1-4 (rows 1..4) must sum to zero per gene.
# Report the largest absolute deviation: a plain maximum of the signed sums would
# show ~0 even if some genes violated the constraint in the negative direction.
np.max(np.abs(np.sum(estimator.par_link_loc[1:5,:], axis=0)))

In [None]:
# Constraint 0: the coefficients of biological replicates 3 and 4 (rows 3..4) must sum to zero per gene.
# Summing rows 1..4 here would merely repeat the check of constraint 1, so only the
# constrained pair is summed; the absolute value keeps negative violations visible.
# NOTE(review): constraints 2 and 3 (technical replicates, rows 5..8) are not checked by any cell.
np.max(np.abs(np.sum(estimator.par_link_loc[3:5,:], axis=0)))

## Comparing the results with the simulated data:

Linear model output:

In [None]:
# Location model: fitted versus simulated linear predictor (design matrix times coefficients).
loc_fitted = np.matmul(estimator.design_loc, estimator.par_link_loc)
loc_simulated = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(loc_fitted, loc_simulated)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: same comparison.
scale_fitted = np.matmul(estimator.design_scale, estimator.par_link_scale)
scale_simulated = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(scale_fitted, scale_simulated)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

# Example 3: advanced

## Simulate some data

In this example, we have the same scenario as in example 2 but one technical replicate is missing. We have to drop the corresponding constraint and remove the two parameters belonging to this pair of technical replicates.

### Define design matrices

In [None]:
# Design matrix for 2000 cells: intercept, four biological replicates,
# two technical replicates, condition.
ncells = 2000
dmat = np.zeros([ncells, 8])
dmat[:, 0] = 1  # intercept
# Columns 1-4: biological replicates 1-4, 500 consecutive cells each.
dmat[:, 1:5] = np.repeat(np.eye(4), 500, axis=0)
# Columns 5-6: technical replicates 1 and 2, 250 cells each, present only in the
# cells of biological replicates 1 and 3; all other cells have no technical replicate.
dmat[0:500, 5:7] = dmat[1000:1500, 5:7] = np.repeat(np.eye(2), 250, axis=0)
dmat[1000:, 7] = 1  # condition effect
print(np.unique(dmat, axis=0))

In [None]:
sim = glm.models.nb_glm.Simulator(num_features=100)

In [None]:
# Location and scale models share the design matrix defined for this example.
sim.parse_dmat_loc(dmat = dmat)
sim.parse_dmat_scale(dmat = dmat)
# Parameters are generated before the data.
# NOTE(review): no random seed is set, so results presumably differ between runs — confirm.
sim.generate_params()
sim.generate_data()

### Simulated model data:

In [None]:
sim.X

## Constraints for model

In [None]:
dmat_est_loc = sim.design_loc

In [None]:
dmat_est_scale = sim.design_scale

Build constraints based on sets of parameters that have to sum to zero. Each of these constraints is enforced by binding one of these parameters to the rest of the set. Such a constraint is encoded by assigning a -1 to the dependent parameter and a 1 to each of the other parameters in the set.

In [None]:
np.unique(dmat_est_loc, axis=0)

In [None]:
# One row per sum-to-zero constraint: -1 marks the dependent parameter,
# 1 marks the remaining members of the constrained set.
constraints_loc = np.zeros([3, dmat_est_loc.shape[1]])
# Constraint 0: Account for perfect confounding at biological replicate and treatment level
# by constraining biological replicate coefficients not to produce mean effects across conditions.
constraints_loc[0, [3, 4]] = [-1, 1]
# Constraint 1: Account for fact that first level of biological replicates was not absorbed into offset.
constraints_loc[1, 1:5] = [-1, 1, 1, 1]
# Constraint 2: Account for fact that first level of technical replicates was not absorbed into offset.
constraints_loc[2, [5, 6]] = [-1, 1]

constraints_loc

In [None]:
constraints_scale = constraints_loc.copy()

In [None]:
from numpy.linalg import matrix_rank

# Rewrite the +1/-1 encoding as plain sum-to-zero rows: every member of a set gets a 1.
constraints_loc_mod = np.where(constraints_loc == -1, 1, constraints_loc)
# Distinct design rows on their own and stacked on top of the constraint rows.
design_rows = np.unique(dmat_est_loc, axis=0)
design_and_constraints = np.vstack([design_rows, constraints_loc_mod])
n_params = dmat_est_loc.shape[1]
print(design_and_constraints)
print("rank deficiency without constraints: "+ str(n_params - matrix_rank(design_rows)))
print("rank deficiency with constraints: "+ str(n_params - matrix_rank(design_and_constraints)))

## Estimate the model

In [None]:
# Inputs for the estimator: simulated counts plus the design matrices of both models.
X, design_loc, design_scale = sim.X, dmat_est_loc, dmat_est_scale

# Bundle data, designs and constraints into one input object.
# NOTE(review): the recorded run of the identical call in Example 1 raised a TypeError on
# 'constraints_loc' — confirm that the installed batchglm version accepts these keywords.
input_data = glm.models.nb_glm.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
    constraints_loc=constraints_loc,
    constraints_scale=constraints_scale,
)

### Set up estimator:

Note that there is no closed form estimator for the mean model here due to the confounding. The model is initialised with least squares but the mean model is also trained.

In [None]:
estimator = glm.models.nb_glm.Estimator(input_data, quick_scale=False)
estimator.initialize()

### Train

Now start the training sequence and let the estimator choose automatically the best training strategy:

In [None]:
estimator.train_sequence('QUICK')

## Obtaining the results

### Check that constraints were met

These parameter sets should sum to zero for each gene.

In [None]:
# Constraint 1: the coefficients of biological replicates 1-4 (rows 1..4) must sum to zero per gene.
# Report the largest absolute deviation: a plain maximum of the signed sums would
# show ~0 even if some genes violated the constraint in the negative direction.
np.max(np.abs(np.sum(estimator.par_link_loc[1:5,:], axis=0)))

In [None]:
# Constraint 0: the coefficients of biological replicates 3 and 4 (rows 3..4) must sum to zero per gene.
# Summing rows 1..4 here would merely repeat the check of constraint 1, so only the
# constrained pair is summed; the absolute value keeps negative violations visible.
# NOTE(review): constraint 2 (technical replicates, rows 5..6) is not checked by any cell.
np.max(np.abs(np.sum(estimator.par_link_loc[3:5,:], axis=0)))

## Comparing the results with the simulated data:

Linear model output:

In [None]:
# Location model: fitted versus simulated linear predictor (design matrix times coefficients).
loc_fitted = np.matmul(estimator.design_loc, estimator.par_link_loc)
loc_simulated = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(loc_fitted, loc_simulated)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: same comparison.
scale_fitted = np.matmul(estimator.design_scale, estimator.par_link_scale)
scale_simulated = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(scale_fitted, scale_simulated)
print("Root mean squared deviation of scale:    %.2f" % scalediff)