In [1]:
import os
import datetime
import numpy as np
import xarray as xa
import pprint

import logging
import warnings

# Show INFO-level messages from the libraries that report training progress.
for noisy_logger in ("tensorflow", "batchglm"):
    logging.getLogger(noisy_logger).setLevel(logging.INFO)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

## Simulate some data

In [4]:
# Create batchglm's `nb_glm` simulator for 100 features (the columns of `sim.X`).
# The design matrices are assigned in the following cells before data is generated.
sim = glm.models.nb_glm.Simulator(num_features=100)

In [5]:
# Location design matrix (2000 observations x 6 parameters):
#   column 0      intercept, set for every observation
#   columns 1-4   bio1..bio4, one indicator per consecutive block of 500 observations
#   column 5      cond2, set for observations 1000-1999
loc_params = ['intercept', 'bio1', 'bio2', 'bio3', 'bio4', 'cond2']
loc_values = np.zeros([2000, 6])
loc_values[:, 0] = 1
for col, first_row in enumerate(range(0, 2000, 500), start=1):
    loc_values[first_row:first_row + 500, col] = 1
loc_values[1000:2000, 5] = 1

dmat = xa.DataArray(
    loc_values,
    dims=["observations", "design_loc_params"],
    coords={"design_loc_params": loc_params},
)
sim.data["design_loc"] = dmat

In [6]:
# Scale design matrix (2000 observations x 6 parameters), laid out as:
# intercept everywhere, one bio indicator per block of 500 observations,
# and cond2 for observations 1000-1999.
scale_params = ['intercept', 'bio1', 'bio2', 'bio3', 'bio4', 'cond2']
replicate_rows = [slice(0, 500), slice(500, 1000), slice(1000, 1500), slice(1500, 2000)]

scale_values = np.zeros([2000, 6])
scale_values[:, 0] = 1
for col, rows in enumerate(replicate_rows, start=1):
    scale_values[rows, col] = 1
scale_values[1000:2000, 5] = 1

dmat = xa.DataArray(
    scale_values,
    dims=["observations", "design_scale_params"],
    coords={"design_scale_params": scale_params},
)
sim.data["design_scale"] = dmat

In [7]:
# Seed NumPy's global random generator so that the simulated parameters and
# counts are identical on every "Restart & Run All".
# NOTE(review): this assumes the batchglm simulator draws from NumPy's global
# RNG -- confirm against the batchglm version in use.
np.random.seed(0)

# Draw model parameters and simulated data for the design matrices set above.
sim.generate()

In [8]:
# Inspect the location design matrix held by the simulator
# (observations x design_loc_params).
sim.design_loc

<xarray.DataArray 'design_loc' (observations: 2000, design_loc_params: 6)>
array([[1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1., 1.],
       [1., 0., 0., 0., 1., 1.]])
Coordinates:
  * design_loc_params  (design_loc_params) <U9 'intercept' 'bio1' 'bio2' ...
Dimensions without coordinates: observations

### Simulated model data:

In [9]:
# Simulated data matrix: one row per observation, one column per feature.
sim.X

<xarray.DataArray 'X' (observations: 2000, features: 100)>
array([[  478, 18830, 12251, ...,  7559,  6097,  5756],
       [  202, 23037,  9357, ...,  3648,  5570,  7311],
       [  152, 34740, 12296, ...,  6450,  7926,  7561],
       ...,
       [  955,  9602,  6298, ...,  3246, 14386, 31011],
       [ 1117, 11882,  7861, ...,  2951, 25191, 28232],
       [  771, 13197,  6668, ...,  4065, 28411, 33315]])
Dimensions without coordinates: observations, features

In [10]:
# Distinct rows of the location design matrix, i.e. every combination of
# indicator columns that actually occurs among the observations.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 1., 1.],
       [1., 0., 0., 1., 0., 1.],
       [1., 0., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.]])

### The parameters used to generate this data:

In [11]:
# Location-model parameters used for the simulation: one row per column of the
# location design matrix, one column per feature.
# NOTE(review): the name suggests these values are on the link scale -- confirm.
sim.par_link_loc

<xarray.DataArray 'a' (design_loc_params: 6, features: 100)>
array([[ 6.045812,  9.167033,  8.973297, ...,  9.021496,  9.014817,  9.089966],
       [-0.256734,  0.58049 ,  0.648218, ..., -0.200294, -0.171946, -0.338725],
       [ 0.053395,  0.02412 ,  0.013432, ...,  0.527793,  0.390301,  0.131826],
       [-0.480349, -0.43441 ,  0.12154 , ...,  0.47683 , -0.238827, -0.410813],
       [ 0.124989, -0.272484,  0.321173, ..., -0.660628,  0.605444,  0.671045],
       [ 0.611672,  0.105642, -0.473701, ..., -0.482572,  0.483994,  0.643748]])
Coordinates:
  * design_loc_params  (design_loc_params) <U9 'intercept' 'bio1' 'bio2' ...
Dimensions without coordinates: features

In [12]:
# Scale-model parameters used for the simulation: one row per column of the
# scale design matrix, one column per feature.
# NOTE(review): the name suggests these values are on the link scale -- confirm.
sim.par_link_scale

<xarray.DataArray 'b' (design_scale_params: 6, features: 100)>
array([[ 2.079442,  0.693147,  2.197225, ...,  2.079442,  1.386294,  2.079442],
       [-0.281658, -0.448469,  0.013467, ...,  0.045641, -0.270166,  0.38107 ],
       [ 0.220301,  0.601458, -0.47555 , ..., -0.141956,  0.546687,  0.684411],
       [ 0.371832, -0.017138,  0.417785, ...,  0.634734, -0.3468  , -0.538411],
       [-0.250497,  0.400265,  0.507651, ..., -0.200762, -0.182734,  0.246663],
       [ 0.393928,  0.011343, -0.083216, ..., -0.611615,  0.252928,  0.487104]])
Coordinates:
  * design_scale_params  (design_scale_params) <U9 'intercept' 'bio1' 'bio2' ...
Dimensions without coordinates: features

## Constraints for model

In [13]:
# One row per constraint, one column per location parameter
# (intercept, bio1, bio2, bio3, bio4, cond2). The parameters with non-zero
# entries in a row are the ones tied together by that constraint.
# NOTE(review): presumably -1 marks the dependent parameter and 1 the
# independent ones -- confirm against the batchglm documentation.
n_loc_params = sim.design_loc.shape[1]
constraints_loc = np.zeros([2, n_loc_params])
constraints_loc[0, 3:5] = [-1, 1]          # constraint 0: bio3, bio4
constraints_loc[1, 1:5] = [-1, 1, 1, 1]    # constraint 1: bio1, bio2, bio3, bio4
constraints_loc

array([[ 0.,  0.,  0., -1.,  1.,  0.],
       [ 0., -1.,  1.,  1.,  1.,  0.]])

In [14]:
# Constraint matrix for the scale model, built with the same pattern as the
# location constraints: row 0 ties bio3/bio4, row 1 ties bio1..bio4.
# NOTE(review): presumably -1 marks the dependent parameter and 1 the
# independent ones -- confirm against the batchglm documentation.
n_scale_params = sim.design_scale.shape[1]
constraints_scale = np.zeros([2, n_scale_params])
constraints_scale[0, 3:5] = [-1, 1]
constraints_scale[1, 1:5] = [-1, 1, 1, 1]
constraints_scale

array([[ 0.,  0.,  0., -1.,  1.,  0.],
       [ 0., -1.,  1.,  1.,  1.,  0.]])

## Estimate the model

In [15]:
# Drop any previous `input_data` binding. The next cell assigns a fresh object,
# so this has no effect on a clean top-to-bottom run.
# NOTE(review): presumably kept to release an old object when re-running -- confirm or remove.
input_data = None

In [16]:
# Take the simulated data and both design matrices from the simulator.
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# Bundle them into the InputData object consumed by the estimator.
input_data = glm.models.nb_glm.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

# Attach the constraint matrices defined in the "Constraints for model" section.
input_data.constraints_loc = constraints_loc
input_data.constraints_scale = constraints_scale

In [17]:
# NOTE(review): redundant -- the previous cell already performs these two
# assignments with the same values. Kept as-is; safe to delete this cell.
input_data.constraints_loc = constraints_loc
input_data.constraints_scale = constraints_scale

### set up estimator:

In [18]:
# Build the estimator on the constrained input data. With quick_scale=False the
# initialization log below reports "Should train r: True".
# NOTE(review): presumably quick_scale=True would skip fitting the scale model -- confirm.
estimator = glm.models.nb_glm.Estimator(input_data, quick_scale=False)
# Initialize the estimator; the log below reports closed-form initialization
# of the mean (MLE) and of the dispersion (MME).
estimator.initialize()

Using closed-form MLE initialization for mean
Should train mu: False
Using closed-form MME initialization for dispersion
Should train r: True


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Graph was finalized.
Running local_init_op.
Done running local_init_op.


### train

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [19]:
# Run the predefined 'QUICK' training sequence. The strategy it expands to is
# echoed at the top of the log output, followed by the loss at every step.
estimator.train_sequence('QUICK')

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True}]
Beginning with training sequence #1
Step: 1	loss: 7654.568380
Step: 2	loss: 5003.113677
Step: 3	loss: 4057.491532
Step: 4	loss: 4451.445869
Step: 5	loss: 7186.717137
Step: 6	loss: 4980.996485
Step: 7	loss: 4360.479423
Step: 8	loss: 4588.746617
Step: 9	loss: 8071.190809
Step: 10	loss: 4395.710618
Step: 11	loss: 4569.978129
Step: 12	loss: 4066.625962
Step: 13	loss: 7627.783795
Step: 14	loss: 4937.498953
Step: 15	loss: 4385.357833
Step: 16	loss: 4149.012362
Step: 17	loss: 7190.199802
Step: 18	loss: 5445.378308
Step: 19	loss: 4121.230940
Step: 20	loss: 4341.818785
Step: 21	loss: 7381.279913
Step: 22	loss: 4910.779687
Step: 23	loss: 4283.777700
Step: 24	loss: 4522.343336
Step: 25	loss: 7936.285462
Step: 26	loss: 4750.447752
Step: 27	loss: 4293.079547
Step: 28	loss: 4117.959345
Step: 29	loss: 8572.606618
Step

## Obtaining the results

The fitted parameters can be retrieved by reading the corresponding attributes of `estimator`:

In [20]:
# Fitted location-model parameters (design_loc_params x features).
estimator.par_link_loc

<xarray.DataArray (design_loc_params: 6, features: 100)>
array([[ 4.620341,  6.914816,  6.738789, ...,  6.790247,  7.054823,  7.019336],
       [-1.461348, -2.26702 , -2.214622, ..., -2.737714, -2.348623, -2.205947],
       [ 1.461348,  2.26702 ,  2.214622, ...,  2.737714,  2.348623,  2.205947],
       [ 0.053355, -0.352749, -0.363557, ...,  0.531118,  0.135382,  0.059618],
       [-0.053355,  0.352749,  0.363557, ..., -0.531118, -0.135382, -0.059618],
       [ 2.002969,  1.717037,  1.647973, ...,  1.999238,  2.902166,  3.090634]])
Coordinates:
  * design_loc_params  (design_loc_params) <U9 'intercept' 'bio1' 'bio2' ...
    feature_allzero    (features) bool False False False False False False ...
  * features           (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

In [21]:
# Fitted scale-model parameters (design_scale_params x features).
estimator.par_link_scale

<xarray.DataArray (design_scale_params: 6, features: 100)>
array([[ 1.610447,  0.769736,  1.529644, ...,  1.250262,  1.137096,  1.664887],
       [-0.707272, -0.610573, -0.16996 , ..., -0.600896, -0.805995, -0.953684],
       [ 0.707272,  0.610573,  0.16996 , ...,  0.600896,  0.805995,  0.953684],
       [-0.28793 , -0.485543, -0.855342, ...,  0.341852, -0.421477, -0.557091],
       [ 0.28793 ,  0.485543,  0.855342, ..., -0.341852,  0.421477,  0.557091],
       [-0.053626, -0.185137,  0.081975, ...,  0.022312, -0.423629, -0.621982]])
Coordinates:
  * design_scale_params  (design_scale_params) <U9 'intercept' 'bio1' 'bio2' ...
    feature_allzero      (features) bool False False False False False False ...
  * features             (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

### Check that constraints were met

Each constrained parameter set (bio1 to bio4, and bio3 to bio4) should sum to zero for every gene (feature), up to floating-point error.

In [26]:
# Largest absolute deviation from zero of bio1 + bio2 + bio3 + bio4 (rows 1-4)
# over all features. The absolute value is needed because a plain max of the
# signed sums would hide violations where the sum is negative.
np.max(np.abs(np.sum(estimator.par_link_loc[1:5,:], axis=0)))

<xarray.DataArray ()>
array(4.440892e-16)

In [27]:
# Largest absolute deviation from zero of bio3 + bio4 (rows 3-4) over all
# features. The absolute value is needed because a plain max of the signed
# sums would hide violations where the sum is negative.
np.max(np.abs(np.sum(estimator.par_link_loc[3:5,:], axis=0)))

<xarray.DataArray ()>
array(0.)

## Comparing the results with the simulated data:

In [36]:
# Compare fitted and simulated linear predictors (design matrix times
# parameters) rather than raw parameters, for the location and scale models.
fitted_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

fitted_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 2.13
Root mean squared deviation of scale:    0.83
