In [1]:
import os
import datetime
import numpy as np
import xarray as xa
import pprint

import logging
import warnings

logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.DEBUG)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

## Simulate some data

In [4]:
sim = glm.models.nb_glm.Simulator(num_features=100)

In [5]:
sim.generate_sample_description(num_batches=2, num_confounders=4)
sim.generate_params()
sim.generate_data()

### Simulated model data:

In [6]:
sim.X

<xarray.DataArray 'X' (observations: 2000, features: 100)>
array([[ 9598,  4652,   319, ...,  6282,  1708,  2095],
       [13948,  4938,   365, ...,  2417,  3692,  1040],
       [ 8743,  8356,  1578, ...,  9620,  3276,  1056],
       ...,
       [ 8598,  3504,  1973, ...,  1451,  5367,  1597],
       [ 6778,  3215,  1380, ..., 17942,  4649,  3053],
       [10059,  8003,  1708, ..., 10895,  4325,  1732]])
Dimensions without coordinates: observations, features

In [7]:
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [8]:
sim.par_link_loc

<xarray.DataArray 'a' (design_loc_params: 5, features: 100)>
array([[ 9.196003e+00,  8.491143e+00,  6.852566e+00, ...,  8.637595e+00,
         8.003287e+00,  7.287410e+00],
       [-2.187385e-01, -1.766351e-01,  1.469890e-02, ...,  9.231006e-02,
         8.961836e-02,  1.568556e-03],
       [ 1.995042e-01, -1.144684e-01,  2.361525e-01, ..., -6.698329e-01,
         3.360822e-01, -1.045535e-01],
       [ 3.163526e-01,  2.160538e-01,  3.013061e-01, ...,  5.993177e-01,
         3.107750e-01,  6.100819e-01],
       [ 6.569366e-01,  5.965733e-01,  1.312351e-01, ...,  5.535659e-01,
         1.333506e-01,  5.455421e-01]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

In [9]:
sim.par_link_scale

<xarray.DataArray 'b' (design_scale_params: 5, features: 100)>
array([[ 1.791759,  2.302585,  1.386294, ...,  1.098612,  2.197225,  1.609438],
       [ 0.0063  ,  0.229879,  0.669902, ...,  0.246405, -0.399035,  0.033967],
       [ 0.689788, -0.296096,  0.421386, ...,  0.598311,  0.070376,  0.4149  ],
       [ 0.562711, -0.469413,  0.460855, ...,  0.146805,  0.288761, -0.098345],
       [-0.676518,  0.299728,  0.318685, ...,  0.373308, -0.391379,  0.452296]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

## Constraints for model

In [43]:
dmat_est_loc = sim.design_loc

In [46]:
dmat_est_scale = sim.design_scale

In [47]:
constraints_loc = np.zeros([2, dmat_est_loc.shape[1]])
constraints_loc[0,1] = -1
constraints_loc[0,2:5] = 1
constraints_loc[1,1:3] = -1
constraints_loc[1,3:5] = 1
constraints_loc

array([[ 0., -1.,  1.,  1.,  1.],
       [ 0., -1., -1.,  1.,  1.]])

In [48]:
constraints_scale = np.zeros([2, dmat_est_scale.shape[1]])
constraints_scale[0,1] = -1
constraints_scale[0,2:5] = 1
constraints_scale[1,1:3] = -1
constraints_scale[1,3:5] = 1
constraints_scale

array([[ 0., -1.,  1.,  1.,  1.],
       [ 0., -1., -1.,  1.,  1.]])

In [49]:
from numpy.linalg import matrix_rank
print(np.vstack([np.unique(dmat_est_loc, axis=0), constraints_loc]))
print("rank deficiency without constraints: "+ str(dmat_est_loc.shape[1] - matrix_rank(np.vstack([np.unique(dmat_est_loc, axis=0)]))))
print("rank deficiency with constraints: "+ str(dmat_est_loc.shape[1] - matrix_rank(np.vstack([np.unique(dmat_est_loc, axis=0), constraints_loc]))))
np.linalg.inv(np.vstack([np.unique(dmat_est_loc, axis=0), constraints_loc]))

[[ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  1.]
 [ 1.  0.  0.  1.  0.]
 [ 1.  0.  1.  0.  0.]
 [ 1.  1.  0.  0.  0.]
 [ 1.  1.  0.  0.  1.]
 [ 1.  1.  0.  1.  0.]
 [ 1.  1.  1.  0.  0.]
 [ 0. -1.  1.  1.  1.]
 [ 0. -1. -1.  1.  1.]]
rank deficiency without constraints: 0
rank deficiency with constraints: 0


LinAlgError: Last 2 dimensions of the array must be square

## Estimate the model

In [15]:
input_data = None

In [16]:
X = sim.X
design_loc = dmat_est_loc
design_scale = dmat_est_scale

# input data
input_data = glm.models.nb_glm.InputData.new(
    data=X, 
    design_loc=design_loc,
    design_scale=design_scale)
input_data.constraints_loc = constraints_loc
input_data.constraints_scale = constraints_scale

In [17]:
input_data.constraints_loc = constraints_loc
input_data.constraints_scale = constraints_scale

### set up estimator:

In [18]:
estimator = glm.models.nb_glm.Estimator(input_data, quick_scale=False)
estimator.initialize()

Using closed-form MLE initialization for mean
RMSE of closed-form mean:
[]
Should train mu: False
Using closed-form MME initialization for dispersion
RMSE of closed-form dispersion:
[]
Should train r: True


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Graph was finalized.
Running local_init_op.
Done running local_init_op.


### train

Now start the training sequence and let the estimator choose automatically the best training strategy:

In [19]:
estimator.train_sequence('AUTO')

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.01,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.25,
  'use_batching': False}]
Beginning with training sequence #1
Step: 1	loss: 909.218686
Step: 2	loss: 909.113620
Step: 3	loss: 909.018105
Step: 4	loss: 908.931312
Step: 5	loss: 908.852508
Step: 6	loss: 908.781084
Step: 7	loss: 908.716429
Step: 8	loss: 908.657927
Step: 9	loss: 908.605016
Step: 10	loss: 908.557184
Step: 11	loss: 908.513951
Step: 12	loss: 908.474877
Step: 13	loss: 908.439576
Step: 14	loss: 908.407705
Step: 15	loss: 908.378943
Step: 16	loss: 908.352983
Step: 17	loss: 908.329553
Step: 18	loss: 908.308419
Step: 19	loss: 908.289373
Step: 20	loss: 908.272216
pval: 0.000017
Step: 21	loss: 908.256747
Step: 22	loss: 908.242765
Step: 23	loss: 908.230074
Step: 24	loss: 908.218499
Step: 25	loss: 908.207886
Step: 26	loss: 908.198100
Step: 27	loss: 908.189024
Step: 28	loss: 908.180565
Step: 29	loss: 908.172652
Step: 30	loss: 90

## Obtaining the results

The fitted parameters can be retrieved by calling the corresponding parameters of `estimator`:

In [20]:
estimator.par_link_loc

<xarray.DataArray (design_loc_params: 6, features: 100)>
array([[ 9.500858e+00,  8.694280e+00,  7.002476e+00, ...,  8.857478e+00,
         8.201733e+00,  7.587812e+00],
       [-2.170482e-02, -7.155830e-03, -2.760999e-02, ..., -2.847717e-02,
         2.061314e-02, -2.449700e-02],
       [-4.340965e-02, -1.431166e-02, -5.521997e-02, ..., -5.695434e-02,
         4.122628e-02, -4.899400e-02],
       [ 4.298312e-02,  1.208923e-02, -1.295364e-03, ...,  8.687255e-03,
         6.952673e-03,  2.610220e-02],
       [-2.127830e-02, -4.933397e-03,  2.890535e-02, ...,  1.978991e-02,
        -2.756581e-02, -1.605199e-03],
       [-2.139402e-01, -1.742721e-01,  2.632533e-02, ...,  8.565863e-02,
         9.926631e-02,  3.642646e-03]])
Coordinates:
  * design_loc_params  (design_loc_params) <U9 'intercept' 'bio1' 'bio2' ...
    feature_allzero    (features) bool False False False False False False ...
  * features           (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

In [21]:
estimator.par_link_scale

<xarray.DataArray (design_scale_params: 6, features: 100)>
array([[ 1.217465,  1.538154,  1.772653, ...,  0.796824,  2.115584,  1.116638],
       [ 0.108818,  0.046557, -0.044049, ...,  0.005485, -0.040011,  0.061176],
       [ 0.217637,  0.093114, -0.088099, ...,  0.01097 , -0.080022,  0.122352],
       [ 0.038498,  0.025622,  0.01294 , ..., -0.147301, -0.101545,  0.018101],
       [-0.147316, -0.072179,  0.031109, ...,  0.141816,  0.141556, -0.079277],
       [ 0.352197,  0.281237,  0.383562, ...,  0.057901, -0.443884,  0.263469]])
Coordinates:
  * design_scale_params  (design_scale_params) <U9 'intercept' 'bio1' 'bio2' ...
    feature_allzero      (features) bool False False False False False False ...
  * features             (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

### Check that constraints were met

These parameter sets should sum to zero for each gene.

In [22]:
np.sum(estimator.par_link_loc[1:5,:], axis=0)

<xarray.DataArray (features: 100)>
array([-0.04341 , -0.014312, -0.05522 ,  0.015561,  0.033623, -0.046711,
        0.107139,  0.02646 ,  0.048112,  0.011796, -0.073012,  0.001583,
       -0.080625, -0.022484,  0.009809, -0.090137, -0.036012,  0.046344,
       -0.031822, -0.017174,  0.065789,  0.099932,  0.030154, -0.068511,
        0.055571,  0.056106,  0.008257, -0.022975,  0.001068,  0.069764,
        0.013374, -0.102713,  0.015313, -0.059028, -0.014516,  0.001358,
       -0.013114,  0.001527,  0.069074, -0.050366,  0.020922, -0.026936,
        0.024557,  0.023731, -0.03017 ,  0.007878, -0.031901,  0.046674,
       -0.019948,  0.049765, -0.03643 ,  0.034322,  0.034613,  0.007805,
        0.06723 , -0.080218,  0.0105  , -0.241046, -0.007088, -0.022811,
        0.046027,  0.086787,  0.056503, -0.08825 , -0.117307, -0.045259,
       -0.047805, -0.389447,  0.00591 , -0.007536, -0.039915,  0.048071,
       -0.165247, -0.032978, -0.158107, -0.052858, -0.015417,  0.018587,
       -0.013339

In [23]:
np.max(estimator.par_link_loc[1,:]+np.sum(estimator.par_link_loc[2:5,:], axis=0))

<xarray.DataArray ()>
array(0.150556)
Coordinates:
    design_loc_params  <U9 'bio1'

In [24]:
np.max(np.sum(estimator.par_link_loc[1:3,:], axis=0)+np.sum(estimator.par_link_loc[3:5,:], axis=0))

<xarray.DataArray ()>
array(0.150556)

## Comparing the results with the simulated data:

Individual coefficients:

In [25]:
estimator.par_link_loc[:,:3]

<xarray.DataArray (design_loc_params: 6, features: 3)>
array([[ 9.500858e+00,  8.694280e+00,  7.002476e+00],
       [-2.170482e-02, -7.155830e-03, -2.760999e-02],
       [-4.340965e-02, -1.431166e-02, -5.521997e-02],
       [ 4.298312e-02,  1.208923e-02, -1.295364e-03],
       [-2.127830e-02, -4.933397e-03,  2.890535e-02],
       [-2.139402e-01, -1.742721e-01,  2.632533e-02]])
Coordinates:
  * design_loc_params  (design_loc_params) <U9 'intercept' 'bio1' 'bio2' ...
    feature_allzero    (features) bool False False False
  * features           (features) int64 0 1 2

In [26]:
sim.par_link_loc[:,:3]

<xarray.DataArray 'a' (design_loc_params: 5, features: 3)>
array([[ 9.196003,  8.491143,  6.852566],
       [-0.218738, -0.176635,  0.014699],
       [ 0.199504, -0.114468,  0.236152],
       [ 0.316353,  0.216054,  0.301306],
       [ 0.656937,  0.596573,  0.131235]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

In [39]:
np.matmul(estimator.design_loc, estimator.par_link_loc)[:10,:5]

array([[9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889],
       [9.47915345, 8.68712424, 6.97486621, 7.63937715, 9.33286889]])

In [41]:
np.matmul(sim.design_loc, sim.par_link_loc)[:10,:5]

array([[9.19600273, 8.49114266, 6.8525657 , 7.55697204, 9.11584935],
       [9.39550692, 8.3766743 , 7.08871818, 7.19990695, 9.15972565],
       [9.51235529, 8.70719642, 7.15387178, 8.13880293, 8.9451711 ],
       [9.85293933, 9.08771593, 6.98380081, 7.3532452 , 9.73717642],
       [9.19600273, 8.49114266, 6.8525657 , 7.55697204, 9.11584935],
       [9.39550692, 8.3766743 , 7.08871818, 7.19990695, 9.15972565],
       [9.51235529, 8.70719642, 7.15387178, 8.13880293, 8.9451711 ],
       [9.85293933, 9.08771593, 6.98380081, 7.3532452 , 9.73717642],
       [9.19600273, 8.49114266, 6.8525657 , 7.55697204, 9.11584935],
       [9.39550692, 8.3766743 , 7.08871818, 7.19990695, 9.15972565]])

In [29]:
(np.matmul(estimator.design_loc, estimator.par_link_loc) - np.matmul(sim.design_loc, sim.par_link_loc))[:3,:5]

array([[ 0.28315071,  0.19598158,  0.12230051,  0.08240511,  0.21701953],
       [ 0.08364652,  0.31044994, -0.11385197,  0.43947019,  0.17314324],
       [-0.03320185, -0.02007218, -0.17900557, -0.49942578,  0.38769779]])

Linear model output:

In [30]:
locdiff = glm.utils.stats.rmsd(np.matmul(estimator.design_loc, estimator.par_link_loc), 
                               np.matmul(sim.design_loc, sim.par_link_loc))
print("Root mean squared deviation of location: %.2f" % locdiff)

scalediff = glm.utils.stats.rmsd(np.matmul(estimator.design_scale, estimator.par_link_scale), 
                                 np.matmul(sim.design_scale, sim.par_link_scale))
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.30
Root mean squared deviation of scale:    0.60
