In [1]:
import os
import datetime
import numpy as np
import numpy.random
import pprint

import logging
import warnings

# Set both the TensorFlow and the batchglm loggers to INFO level.
for _logger_name in ("tensorflow", "batchglm"):
    logging.getLogger(_logger_name).setLevel(logging.INFO)

# Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

# Simulate some data

In [4]:
# Seed numpy's global RNG so that the random draws made through numpy.random in
# this notebook are repeatable across "Restart Kernel -> Run All".
# NOTE(review): the simulated counts themselves only become repeatable if the
# Simulator draws from numpy's global RNG -- confirm against batchglm. The
# batched training further down may have its own source of randomness.
numpy.random.seed(0)

# Simulate a data set with 1000 features and keep a handle on the count matrix.
sim = glm.models.nb_glm.Simulator(num_features=1000)
sim.generate()
X = sim.X

Superimpose library size effects on counts:

In [5]:
# Draw one multiplicative size factor per row of the simulated count matrix.
size_factors = numpy.random.normal(loc=1, scale=0.5, size=sim.X.shape[0])
# Floor the factors at 0.2 so that no row is scaled by a near-zero or negative value.
size_factors[size_factors < 0.2] = 0.2
# Scale every row of the *raw* simulated counts by its factor and round to whole counts.
# Reading from sim.X instead of the previously assigned X keeps this cell idempotent:
# re-running it no longer stacks a fresh set of size factors on top of already scaled counts.
X = np.round(sim.X * np.repeat(np.expand_dims(size_factors, axis=1), axis=1, repeats=sim.X.shape[1]))

Check size factor scaling:

In [6]:
# Row sums of X divided by their mean give the relative library size of each row;
# display the mean difference between those and the applied size factors.
library_sizes = np.sum(X, axis=1)
relative_library_sizes = library_sizes.values / np.mean(library_sizes).values
np.mean(relative_library_sizes - size_factors)

-0.017922069110753173

In [7]:
# Show the distinct rows of the simulated location design matrix.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

Add a continuous covariate to design_loc so that the location model cannot be perfectly initialized:

In [8]:
# Append one normally distributed column (mean 1, standard deviation 1) to the
# simulated location design matrix; the result is a plain numpy array.
n_rows = sim.design_loc.shape[0]
extra_covariate = numpy.random.normal(loc=1, scale=1, size=n_rows)
design_loc = np.hstack([sim.design_loc.values, extra_covariate[:, np.newaxis]])

# Estimate the model

## With size factors

### Set up estimator

In [9]:
# Bundle the scaled counts, the extended location design, the simulated scale
# design and the size factors into the estimator's input container.
input_kwargs = dict(
    data=X,
    design_loc=design_loc,
    design_scale=sim.design_scale,
    size_factors=size_factors,
)
input_data = glm.models.nb_glm.InputData.new(**input_kwargs)

In [10]:
# Build the estimator with 'standard' initialisation for both init_a and init_b
# and a batch size of 500, then initialise it.
# NOTE(review): init_a / init_b presumably correspond to the mean and dispersion
# models named in the log output -- confirm against batchglm.
estimator_options = dict(init_a="standard", init_b="standard", batch_size=500)
estimator = glm.models.nb_glm.Estimator(input_data, **estimator_options)
estimator.initialize()

Using standard initialization for mean
Should train mu: True
Using standard initialization for dispersion
Should train r: True
Graph was finalized.
Running local_init_op.
Done running local_init_op.


### Train

In [11]:
# Fit the model with the 'QUICK' training sequence; the log printed below echoes the settings it uses.
estimator.train_sequence('QUICK')

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True}]
Beginning with training sequence #1
Step: 1	loss: 9310.697198
Step: 2	loss: 9286.242484
Step: 3	loss: 9210.827453
Step: 4	loss: 9079.162826
Step: 5	loss: 9024.435197
Step: 6	loss: 8978.609955
Step: 7	loss: 8883.524098
Step: 8	loss: 8919.163054
Step: 9	loss: 8888.721216
Step: 10	loss: 8873.238280
Step: 11	loss: 8857.346668
Step: 12	loss: 8840.651387
Step: 13	loss: 8819.011068
Step: 14	loss: 8837.200998
Step: 15	loss: 8849.620234
Step: 16	loss: 8835.735092
Step: 17	loss: 8794.641498
Step: 18	loss: 8842.283250
Step: 19	loss: 8788.239622
Step: 20	loss: 8830.757182
Step: 21	loss: 8792.120407
Step: 22	loss: 8798.450208
Step: 23	loss: 8775.659502
Step: 24	loss: 8801.616521
Step: 25	loss: 8805.316500
Step: 26	loss: 8752.636973
Step: 27	loss: 8776.755684
Step: 28	loss: 8797.421302
Step: 29	loss: 8750.314173
Step

Step: 289	loss: 8669.915697
Step: 290	loss: 8780.410298
Step: 291	loss: 8767.295765
Step: 292	loss: 8780.389245
Step: 293	loss: 8729.448167
Step: 294	loss: 8741.509567
Step: 295	loss: 8774.033526
Step: 296	loss: 8750.479377
Step: 297	loss: 8722.656853
Step: 298	loss: 8776.073892
Step: 299	loss: 8777.552791
Step: 300	loss: 8720.446490
pval: 0.615669
Training sequence #1 complete


### Comparing the results with the simulated data:

Individual coefficients:

In [12]:
# Location: the last row of the fitted coefficients is dropped before comparing
# against the simulated ground truth (the fitted design has one extra column).
fitted_loc = estimator.par_link_loc[:-1, :].values
true_loc = sim.par_link_loc.values
locdiff = glm.utils.stats.rmsd(fitted_loc, true_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale: fitted and simulated parameters are compared as-is.
fitted_scale = estimator.par_link_scale
true_scale = sim.par_link_scale
scalediff = glm.utils.stats.rmsd(fitted_scale, true_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.08


Linear model output:

In [13]:
# Compare linear predictors (design matrix times coefficients) rather than raw coefficients.
fitted_eta_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
true_eta_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_eta_loc, true_eta_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

fitted_eta_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
true_eta_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_eta_scale, true_eta_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.05
Root mean squared deviation of scale:    0.08


## Without size factors

### Set up estimator

In [14]:
# Same scaled counts as before, but with the simulator's own design matrices
# and no size factors passed to the model.
input_kwargs = dict(
    data=X,
    design_loc=sim.design_loc,
    design_scale=sim.design_scale,
    size_factors=None,
)
input_data = glm.models.nb_glm.InputData.new(**input_kwargs)

In [15]:
# Rebuild the estimator on the new input data with the same settings
# ('standard' initialisation for init_a and init_b, batch size 500) and initialise it.
estimator_options = dict(init_a="standard", init_b="standard", batch_size=500)
estimator = glm.models.nb_glm.Estimator(input_data, **estimator_options)
estimator.initialize()

Using standard initialization for mean
Should train mu: True
Using standard initialization for dispersion
Should train r: True
Graph was finalized.
Running local_init_op.
Done running local_init_op.


### Train

In [16]:
# Fit the model with the "QUICK" training sequence; the log printed below echoes the settings it uses.
estimator.train_sequence("QUICK")

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True}]
Beginning with training sequence #1
Step: 1	loss: 9470.138304
Step: 2	loss: 9458.059912
Step: 3	loss: 9373.541370
Step: 4	loss: 9334.954461
Step: 5	loss: 9306.730194
Step: 6	loss: 9347.867113
Step: 7	loss: 9289.773319
Step: 8	loss: 9306.822366
Step: 9	loss: 9282.921417
Step: 10	loss: 9336.342939
Step: 11	loss: 9308.485450
Step: 12	loss: 9272.548602
Step: 13	loss: 9276.389099
Step: 14	loss: 9292.443456
Step: 15	loss: 9290.597210
Step: 16	loss: 9282.899090
Step: 17	loss: 9301.267784
Step: 18	loss: 9259.684296
Step: 19	loss: 9292.685667
Step: 20	loss: 9274.766977
Step: 21	loss: 9279.150510
Step: 22	loss: 9289.743058
Step: 23	loss: 9267.081905
Step: 24	loss: 9262.336138
Step: 25	loss: 9247.386821
Step: 26	loss: 9306.014561
Step: 27	loss: 9265.024327
Step: 28	loss: 9271.668478
Step: 29	loss: 9230.510962
Step

Step: 289	loss: 9267.646973
Step: 290	loss: 9256.757971
Step: 291	loss: 9269.803401
Step: 292	loss: 9279.803906
Step: 293	loss: 9238.758431
Step: 294	loss: 9276.509319
Step: 295	loss: 9259.175140
Step: 296	loss: 9309.244444
Step: 297	loss: 9259.203673
Step: 298	loss: 9276.826329
Step: 299	loss: 9278.791471
Step: 300	loss: 9266.655260
pval: 0.489436
Training sequence #1 complete


### Comparing the results with the simulated data:

Individual coefficients:

In [17]:
# Root mean squared deviation between fitted and simulated coefficients,
# for the location model first and the scale model second.
fitted_loc, true_loc = estimator.par_link_loc, sim.par_link_loc
locdiff = glm.utils.stats.rmsd(fitted_loc, true_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

fitted_scale, true_scale = estimator.par_link_scale, sim.par_link_scale
scalediff = glm.utils.stats.rmsd(fitted_scale, true_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.05
Root mean squared deviation of scale:    0.48


Linear model output:

In [18]:
# Compare linear predictors (design matrix times coefficients) rather than raw coefficients.
fitted_eta_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
true_eta_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_eta_loc, true_eta_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

fitted_eta_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
true_eta_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_eta_scale, true_eta_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.05
Root mean squared deviation of scale:    1.04


It is evident that the dispersion (the variance model) is badly estimated if the size factors are not accounted for, as they represent unaccounted confounding.