In [1]:
import os
import datetime
import numpy as np
import numpy.random
import pprint

import logging
import warnings

# Set the log threshold of the tensorflow and batchglm loggers to INFO.
for logger_name in ("tensorflow", "batchglm"):
    logging.getLogger(logger_name).setLevel(logging.INFO)

# Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

# Simulate some data

In [4]:
# Create a batchglm nb_glm simulator with 1000 features and let it generate a data set.
# NOTE(review): nb_glm presumably denotes a negative-binomial GLM — confirm in the batchglm docs.
sim = glm.models.nb_glm.Simulator(num_features=1000)
sim.generate()
# Simulated data matrix; it is rescaled further down and then passed to the
# estimator as `data`.
X = sim.X

Superimpose library size effects on counts:

In [5]:
# Draw one library size factor per row of the simulated data from N(1, 0.5^2).
# A dedicated, seeded generator makes the factors identical after every
# kernel restart without touching numpy's global random state.
size_factor_rng = numpy.random.RandomState(0)
size_factors = size_factor_rng.normal(loc=1, scale=0.5, size=sim.X.shape[0])

# Clamp from below: a normal draw can be close to zero or negative, which is
# not usable as a multiplicative scaling factor.
size_factors = np.maximum(size_factors, 0.2)

# Scale each row by its factor and round the result.
# - Start from sim.X (the unscaled simulation) rather than from X, so that
#   re-running this cell does not compound the scaling.
# - Broadcasting an (n, 1) column against the (n, m) matrix avoids
#   materialising a repeated (n, m) factor matrix.
X = np.round(sim.X * size_factors[:, np.newaxis])

Check size factor scaling:

In [6]:
# Row sums of X divided by their mean give the realised relative library
# sizes; display the mean difference between those and the applied size factors.
library_sizes = np.sum(X, axis=1)
relative_library_sizes = library_sizes.values / np.mean(library_sizes).values
np.mean(relative_library_sizes - size_factors)

-0.036100913313054886

In [7]:
# Show the distinct rows of the simulated location design matrix.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

Add a continuous covariate to the location design matrix (`design_loc`):

In [8]:
# Start from the simulated location design and append one extra column that
# holds a continuous covariate drawn from N(1, 1), one value per row.
design_loc = sim.design_loc
n_rows = design_loc.shape[0]
continuous_covariate = numpy.random.normal(loc=1, scale=1, size=n_rows)
design_loc = np.hstack([design_loc.values, continuous_covariate[:, np.newaxis]])

# Estimate the model

## With size factors

### Set up estimator

In [9]:
# Bundle the scaled counts, the augmented location design (simulated design
# plus the continuous covariate column), the simulated scale design and one
# size factor per row of X into a single batchglm input object.
input_data = glm.models.nb_glm.InputData.new(
    data=X, 
    design_loc=design_loc, 
    design_scale=sim.design_scale,
    size_factors=size_factors
)

In [10]:
# Build the estimator on input_data using the 'standard' mode for both
# init_a and init_b, then initialise it.
# NOTE(review): the recorded log suggests init_a concerns the mean and init_b
# the dispersion — confirm against the batchglm docs.
estimator = glm.models.nb_glm.Estimator(input_data, init_a='standard', init_b='standard')
estimator.initialize()

Using standard initialization for mean
Should train mu: True
Using standard initialization for dispersion
Should train r: True
Graph was finalized.
Running local_init_op.
Done running local_init_op.


### Train

In [11]:
# Run the predefined 'QUICK' training sequence; the strategy it expands to is
# echoed at the top of the output.
estimator.train_sequence('QUICK')

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True}]
Beginning with training sequence #1
Step: 1	loss: 9349.023429
Step: 2	loss: 9348.346032
Step: 3	loss: 9305.726775
Step: 4	loss: 9262.448597
Step: 5	loss: 9203.921256
Step: 6	loss: 9200.577417
Step: 7	loss: 9051.681319
Step: 8	loss: 9059.095941
Step: 9	loss: 9047.526901
Step: 10	loss: 9038.011104
Step: 11	loss: 8959.259093
Step: 12	loss: 8918.269985
Step: 13	loss: 8930.100143
Step: 14	loss: 8938.436552
Step: 15	loss: 8917.905964
Step: 16	loss: 8840.571024
Step: 17	loss: 8885.640464
Step: 18	loss: 8886.688097
Step: 19	loss: 8818.634583
Step: 20	loss: 8872.522261
Step: 21	loss: 8877.302940
Step: 22	loss: 8847.278586
Step: 23	loss: 8826.096943
Step: 24	loss: 8840.475552
Step: 25	loss: 8833.890737
Step: 26	loss: 8843.111580
Step: 27	loss: 8853.530884
Step: 28	loss: 8823.257045
Step: 29	loss: 8832.820885
Ste

Step: 289	loss: 8754.385319
Step: 290	loss: 8804.990925
Step: 291	loss: 8770.171753
Step: 292	loss: 8805.411745
Step: 293	loss: 8800.644576
Step: 294	loss: 8723.383024
Step: 295	loss: 8796.725462
Step: 296	loss: 8814.324475
Step: 297	loss: 8786.904414
Step: 298	loss: 8752.297882
Step: 299	loss: 8781.539296
Step: 300	loss: 8814.298923
pval: 0.433574
Training sequence #1 complete


### Comparing the results with the simulated data:

Individual coefficients:

In [13]:
# This fit used one extra design column (the continuous covariate, appended
# last), so the final fitted coefficient row has no simulated counterpart and
# is excluded from the comparison.
# NOTE(review): assumes par_link_loc rows follow the design_loc column order — confirm.
fitted_loc_coef = estimator.par_link_loc[:-1, :].values
simulated_loc_coef = sim.par_link_loc.values
locdiff = glm.utils.stats.rmsd(fitted_loc_coef, simulated_loc_coef)
print("Root mean squared deviation of location: %.2f" % locdiff)

# The scale design was not modified, so the scale coefficients are compared as they are.
fitted_scale_coef = estimator.par_link_scale
simulated_scale_coef = sim.par_link_scale
scalediff = glm.utils.stats.rmsd(fitted_scale_coef, simulated_scale_coef)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.08


Linear model output:

In [14]:
# Compare linear predictors (design matrix times coefficients) instead of the
# raw coefficients: first for the location model ...
fitted_loc_lin = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc_lin = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc_lin, simulated_loc_lin)
print("Root mean squared deviation of location: %.2f" % locdiff)

# ... then for the scale model.
fitted_scale_lin = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale_lin = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale_lin, simulated_scale_lin)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.07


## Without size factors

### Set up estimator

In [15]:
# Same scaled counts as before, but this time with the simulator's original
# location design (no extra continuous covariate) and no size factors supplied.
# Note that this rebinds `input_data` from the previous section.
input_data = glm.models.nb_glm.InputData.new(
    data=X, 
    design_loc=sim.design_loc, 
    design_scale=sim.design_scale,
    size_factors=None
)

In [20]:
# Build a fresh estimator on the size-factor-free input_data, again with the
# 'standard' mode for init_a and init_b, then initialise it.
# Note that this rebinds `estimator` from the previous section.
estimator = glm.models.nb_glm.Estimator(input_data, init_a='standard', init_b='standard')
estimator.initialize()

Using standard initialization for mean
Should train mu: True
Using standard initialization for dispersion
Should train r: True
Graph was finalized.
Running local_init_op.
Done running local_init_op.


### Train

In [21]:
# Run the predefined "AUTO" training sequence; the strategy echoed in the
# recorded output has two stages.
estimator.train_sequence("AUTO")

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]
Beginning with training sequence #1
Step: 1	loss: 9523.144088
Step: 2	loss: 9478.396673
Step: 3	loss: 9413.858916
Step: 4	loss: 9362.095233
Step: 5	loss: 9356.345579
Step: 6	loss: 9336.932951
Step: 7	loss: 9357.801661
Step: 8	loss: 9333.369691
Step: 9	loss: 9326.729463
Step: 10	loss: 9358.242468
Step: 11	loss: 9368.139463
Step: 12	loss: 9286.259570
Step: 13	loss: 9312.619368
Step: 14	loss: 9327.936684
Step: 15	loss: 9290.155924
Step: 16	loss: 9341.765085
Step: 17	loss: 9296.375203
Step: 18	loss: 9318.889652
Step: 19	loss: 9329.637868
Step: 20	loss: 9308.188994
Step: 21	loss: 9321.345349
Step: 22	loss: 9299.253636
Step: 23	loss: 9287.53579

Step: 283	loss: 9292.741937
Step: 284	loss: 9280.745204
Step: 285	loss: 9291.840491
Step: 286	loss: 9328.143576
Step: 287	loss: 9297.440166
Step: 288	loss: 9283.101632
Step: 289	loss: 9267.981149
Step: 290	loss: 9327.463614
Step: 291	loss: 9307.681920
Step: 292	loss: 9290.215485
Step: 293	loss: 9292.076662
Step: 294	loss: 9297.269904
Step: 295	loss: 9295.472917
Step: 296	loss: 9310.090950
Step: 297	loss: 9293.645186
Step: 298	loss: 9314.240113
Step: 299	loss: 9274.333341
Step: 300	loss: 9312.695285
pval: 0.567980
Training sequence #1 complete
Beginning with training sequence #2
Step: 301	loss: 9297.443807
Step: 302	loss: 9307.641087
Step: 303	loss: 9298.315078
Step: 304	loss: 9300.108662
Step: 305	loss: 9301.767079
Step: 306	loss: 9299.784890
Step: 307	loss: 9297.809597
Step: 308	loss: 9297.983685
Step: 309	loss: 9299.090942
Step: 310	loss: 9299.081192
Step: 311	loss: 9297.957163
Step: 312	loss: 9297.089310
Step: 313	loss: 9297.128566
Step: 314	loss: 9297.582223
Step: 315	loss: 9297.76

### Comparing the results with the simulated data:

Individual coefficients:

In [22]:
# RMSD between fitted and simulated location coefficients. No row is dropped
# here: this fit used sim.design_loc unchanged, so presumably the coefficient
# shapes already match — TODO confirm.
locdiff = glm.utils.stats.rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# RMSD between fitted and simulated scale coefficients.
scalediff = glm.utils.stats.rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.04
Root mean squared deviation of scale:    0.46


Linear model output:

In [23]:
# Compare linear predictors (design matrix times coefficients) instead of the
# raw coefficients: first for the location model ...
fitted_loc_lin = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc_lin = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc_lin, simulated_loc_lin)
print("Root mean squared deviation of location: %.2f" % locdiff)

# ... then for the scale model.
fitted_scale_lin = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale_lin = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale_lin, simulated_scale_lin)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.05
Root mean squared deviation of scale:    1.05


In [None]:
It is evident that the dispersion (the variance model) is badly estimated when the size factors are not accounted for, because they then act as unaccounted confounding.