In [1]:
import os
import datetime
import numpy as np
import numpy.random
import pprint

import logging
import warnings

# Set the log level of the TensorFlow and batchglm loggers to INFO.
for logger_name in ("tensorflow", "batchglm"):
    logging.getLogger(logger_name).setLevel(logging.INFO)

# Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# Housekeeping only: hide DeprecationWarnings raised from tensorflow modules
# so they do not clutter the cell outputs below.
warnings.filterwarnings(
    action="ignore",
    category=DeprecationWarning,
    module="tensorflow",
)

# Simulate some data

In [4]:
# Number of features requested from the simulator.
NUM_FEATURES = 1000

# Build a simulator from batchglm's nb_glm module and generate its data.
sim = glm.models.nb_glm.Simulator(num_features=NUM_FEATURES)
sim.generate()

# Keep a handle on the simulated data matrix; later cells rescale it.
X = sim.X

Superimpose library size effects on counts:

In [5]:
# Draw one random library-size factor per observation (row of the count matrix).
size_factors = numpy.random.normal(loc=1, scale=0.5, size=sim.X.shape[0])

# Floor the factors at 0.2 so no observation is scaled by a near-zero or negative value.
size_factors = np.maximum(size_factors, 0.2)

# Scale the original simulated counts row-wise and round to whole counts.
# X is derived from sim.X rather than from X itself so that this cell is
# idempotent: re-running it does not compound the scaling onto counts that
# were already scaled. Broadcasting the (n, 1) column of factors replaces the
# explicit np.repeat tiling and gives the same values.
X = np.round(sim.X * size_factors[:, np.newaxis])

Check size factor scaling:

In [6]:
# Row totals of the scaled counts, relative to their mean, should track the
# size factors; the mean difference is displayed as the cell output.
row_totals = np.sum(X, axis=1)
relative_totals = row_totals.values / np.mean(row_totals).values
np.mean(relative_totals - size_factors)

-0.015730194737318067

In [7]:
# Show the distinct rows of the simulated location design matrix.
unique_design_rows = np.unique(sim.design_loc, axis=0)
unique_design_rows

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

Add a continuous covariate to design_loc so that the location model cannot be perfectly initialized:

In [8]:
# Draw one N(1, 1) value per observation to serve as a continuous covariate.
n_observations = sim.design_loc.shape[0]
continuous_covariate = numpy.random.normal(loc=1, scale=1, size=n_observations)

# Append the covariate as the last column of the simulated location design.
design_loc = np.hstack([
    sim.design_loc.values,
    continuous_covariate[:, np.newaxis],
])

# Estimate the model

## With size factors

### Set up estimator

In [9]:
# Bundle the scaled counts, the extended location design, the simulated scale
# design and the per-observation size factors into the estimator's input object.
nb_glm_inputs = dict(
    data=X,
    design_loc=design_loc,
    design_scale=sim.design_scale,
    size_factors=size_factors,
)
input_data = glm.models.nb_glm.InputData.new(**nb_glm_inputs)

In [10]:
# Build the estimator on the prepared input with 'standard' initialization
# for both init_a and init_b and a batch size of 500, then initialize it.
estimator_options = dict(init_a="standard", init_b="standard", batch_size=500)
estimator = glm.models.nb_glm.Estimator(input_data, **estimator_options)
estimator.initialize()

Using standard initialization for mean
Should train mu: True
Using standard initialization for dispersion
Should train r: True
Graph was finalized.
Running local_init_op.
Done running local_init_op.


### Train

In [11]:
# Run the training sequence named 'QUICK' on the estimator.
training_strategy = "QUICK"
estimator.train_sequence(training_strategy)

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True}]
Beginning with training sequence #1
Step: 1	loss: 9254.189149
Step: 2	loss: 9292.989550
Step: 3	loss: 9167.053270
Step: 4	loss: 9052.905853
Step: 5	loss: 8967.553832
Step: 6	loss: 8959.901128
Step: 7	loss: 8874.794233
Step: 8	loss: 8854.083242
Step: 9	loss: 8852.239760
Step: 10	loss: 8777.278185
Step: 11	loss: 8845.530370
Step: 12	loss: 8820.528019
Step: 13	loss: 8818.555588
Step: 14	loss: 8812.567318
Step: 15	loss: 8762.291693
Step: 16	loss: 8797.929816
Step: 17	loss: 8840.081402
Step: 18	loss: 8745.417729
Step: 19	loss: 8792.725467
Step: 20	loss: 8736.065210
Step: 21	loss: 8732.579664
Step: 22	loss: 8785.134898
Step: 23	loss: 8720.633028
Step: 24	loss: 8782.008826
Step: 25	loss: 8752.383262
Step: 26	loss: 8733.759706
Step: 27	loss: 8751.284042
Step: 28	loss: 8746.472003
Step: 29	loss: 8768.293886
Step

Step: 289	loss: 8712.210471
Step: 290	loss: 8724.768196
Step: 291	loss: 8699.936695
Step: 292	loss: 8708.175842
Step: 293	loss: 8709.959639
Step: 294	loss: 8709.060281
Step: 295	loss: 8677.341841
Step: 296	loss: 8748.233477
Step: 297	loss: 8702.330357
Step: 298	loss: 8711.757124
Step: 299	loss: 8697.965955
Step: 300	loss: 8732.572436
pval: 0.680317
Training sequence #1 complete


### Comparing the results with the simulated data:

Individual coefficients:

In [12]:
# Location coefficients: drop the last fitted row before comparing, since the
# fitted design carries one more column (the appended continuous covariate)
# than the simulated design.
fitted_loc = estimator.par_link_loc[:-1, :].values
simulated_loc = sim.par_link_loc.values
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale coefficients: fitted and simulated parameters are compared directly.
fitted_scale = estimator.par_link_scale
simulated_scale = sim.par_link_scale
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.08


Linear model output:

In [13]:
# Location model: compare design @ coefficients for the fit and the simulation.
fitted_loc_output = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc_output = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc_output, simulated_loc_output)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: same comparison for the scale design and coefficients.
fitted_scale_output = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale_output = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale_output, simulated_scale_output)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.05
Root mean squared deviation of scale:    0.08


## Without size factors

### Set up estimator

In [17]:
# Same scaled counts as before, but with the simulator's own location design
# and no size factors passed to the model (size_factors=None).
nb_glm_inputs = dict(
    data=X,
    design_loc=sim.design_loc,
    design_scale=sim.design_scale,
    size_factors=None,
)
input_data = glm.models.nb_glm.InputData.new(**nb_glm_inputs)

In [None]:
# Build the estimator on the current input_data with 'standard' initialization
# for both init_a and init_b and a batch size of 500, then initialize it.
estimator_options = dict(init_a="standard", init_b="standard", batch_size=500)
estimator = glm.models.nb_glm.Estimator(input_data, **estimator_options)
estimator.initialize()

Using standard initialization for mean
Should train mu: True
Using standard initialization for dispersion
Should train r: True


### Train

In [None]:
# Run the training sequence named "QUICK" on the estimator.
training_strategy = "QUICK"
estimator.train_sequence(training_strategy)

### Comparing the results with the simulated data:

Individual coefficients:

In [None]:
# Location coefficients: fitted versus simulated parameters.
fitted_loc = estimator.par_link_loc
simulated_loc = sim.par_link_loc
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale coefficients: fitted versus simulated parameters.
fitted_scale = estimator.par_link_scale
simulated_scale = sim.par_link_scale
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Linear model output:

In [None]:
# Location model: compare design @ coefficients for the fit and the simulation.
fitted_loc_output = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc_output = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc_output, simulated_loc_output)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: same comparison for the scale design and coefficients.
fitted_scale_output = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale_output = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale_output, simulated_scale_output)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

It is evident that the dispersion (the variance model) is badly estimated if the size factors are not accounted for, as they represent unaccounted confounding.