In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

# Per-library log verbosity: TensorFlow at INFO, batchglm at the more verbose DEBUG.
for _logger_name, _level in (("tensorflow", logging.INFO), ("batchglm", logging.DEBUG)):
    logging.getLogger(_logger_name).setLevel(_level)

## Import batchglm

In [2]:
import batchglm.api as glm

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

## Simulate some data

In [4]:
# Number of features (columns of the simulated data matrix).
NUM_FEATURES = 100

# Create the simulator and draw a synthetic dataset from it.
# NOTE(review): no random seed is set anywhere in this notebook, so the
# simulated values presumably differ between runs - confirm whether the
# Simulator exposes a seed if reproducible output is needed.
sim = glm.models.nb_glm.Simulator(num_features=NUM_FEATURES)
sim.generate()

### Simulated model data:

In [5]:
sim.X

<xarray.DataArray 'X' (observations: 2000, features: 100)>
array([[   32,   416,  6105, ...,  8969,  1636,   663],
       [   23,   626, 10337, ...,  4622,  1413,   777],
       [   26,   508,  2510, ...,  8820,  2166,  1007],
       ...,
       [   30,   198,  8707, ...,  8151,  1100,   616],
       [   36,   156,  3285, ...,  8939,  1053,   625],
       [   38,   247,  8099, ...,  6965,  1415,   644]])
Dimensions without coordinates: observations, features

In [6]:
# axis=0 makes np.unique compare whole rows, so this displays each distinct
# row of the location design matrix once (sorted), not every observation.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.]])

In [7]:
# axis=0 makes np.unique compare whole rows, so this displays each distinct
# row of the scale design matrix once (sorted), not every observation.
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.]])

### The parameters used to generate this data:

In [8]:
sim.par_link_loc

<xarray.DataArray 'a' (design_loc_params: 5, features: 100)>
array([[ 3.453986,  5.866545,  8.830487, ...,  9.0674  ,  7.869775,  6.942665],
       [-0.485149, -0.186729,  0.520302, ...,  0.344557,  0.171454,  0.352811],
       [ 0.64733 ,  0.184957, -0.163139, ...,  0.385477,  0.190301,  0.375933],
       [ 0.314313, -0.173535, -0.202074, ...,  0.383422, -0.363319, -0.523276],
       [-0.333154, -0.132375,  0.374714, ..., -0.410275, -0.292073,  0.10605 ]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

In [9]:
sim.par_link_scale

<xarray.DataArray 'b' (design_scale_params: 5, features: 100)>
array([[ 2.302585,  1.386294,  1.94591 , ...,  1.098612,  2.302585,  2.079442],
       [-0.115073,  0.056259,  0.019004, ...,  0.462091,  0.305876, -0.525032],
       [ 0.161889,  0.680545,  0.508986, ..., -0.576983,  0.517051,  0.403571],
       [ 0.692426,  0.322277,  0.581509, ..., -0.212946,  0.136586,  0.521936],
       [ 0.305526,  0.166326,  0.28729 , ...,  0.581877,  0.06642 ,  0.420869]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

## Estimate the model

In [10]:
# Take the simulated data matrix and both design matrices from the simulator ...
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# ... and bundle them into the input object the estimator is built from.
input_data = glm.models.nb_glm.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

### set up estimator:

In [11]:
# Build the estimator from the prepared input data, then initialize it before
# training. The log output of initialize() below reports the closed-form
# initialization used for the mean and the dispersion.
estimator = glm.models.nb_glm.Estimator(input_data)
estimator.initialize()

Using closed-form MLE initialization for mean
Should train mu: True
Using closed-form MME initialization for dispersion
Should train r: True
Graph was finalized.
Running local_init_op.
Done running local_init_op.


### Now train:

There are multiple possible training strategies:

In [12]:
# Print the name of every member of the estimator's TrainingStrategy enumeration.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

AUTO
DEFAULT
EXACT
QUICK
PRE_INITIALIZED


Each strategy is a list of training-option sets; each set is passed, in order, to a separate call of `estimator.train()`:

In [13]:
pprint.pprint(estimator.TrainingStrategy.DEFAULT.value)

[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]


Therefore, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which is in turn equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [14]:
estimator.train_sequence("AUTO")

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]
Beginning with training sequence #1
Step: 1	loss: 870.819283
Step: 2	loss: 886.954227
Step: 3	loss: 877.969386
Step: 4	loss: 879.416220
Step: 5	loss: 875.629420
Step: 6	loss: 878.777180
Step: 7	loss: 878.018419
Step: 8	loss: 876.890313
Step: 9	loss: 873.899672
Step: 10	loss: 875.953812
Step: 11	loss: 878.074777
Step: 12	loss: 876.909827
Step: 13	loss: 872.009427
Step: 14	loss: 874.448934
Step: 15	loss: 875.445427
Step: 16	loss: 875.966209
Step: 17	loss: 872.381678
Step: 18	loss: 873.777388
Step: 19	loss: 875.611500
Step: 20	loss: 875.068247
Step: 21	loss: 871.718600
Step: 22	loss: 874.416605
Step: 23	loss: 874.773107
Step: 24	loss: 873.96

Step: 294	loss: 873.554922
Step: 295	loss: 874.425155
Step: 296	loss: 874.733963
Step: 297	loss: 871.142162
Step: 298	loss: 873.126753
Step: 299	loss: 874.777332
Step: 300	loss: 874.216714
pval: 0.619993
Training sequence #1 complete
Beginning with training sequence #2
Step: 301	loss: 873.109344
Step: 302	loss: 876.143956
Step: 303	loss: 873.395818
Step: 304	loss: 873.607083
Step: 305	loss: 874.349209
Step: 306	loss: 873.857454
Step: 307	loss: 873.158390
Step: 308	loss: 873.155839
Step: 309	loss: 873.523994
Step: 310	loss: 873.544317
Step: 311	loss: 873.182074
Step: 312	loss: 872.904408
Step: 313	loss: 872.953067
Step: 314	loss: 873.129368
Step: 315	loss: 873.159188
Step: 316	loss: 873.032171
Step: 317	loss: 872.913430
Step: 318	loss: 872.898050
Step: 319	loss: 872.934936
Step: 320	loss: 872.935527
pval: 0.009443
Step: 321	loss: 872.886982
Step: 322	loss: 872.842937
Step: 323	loss: 872.838025
Step: 324	loss: 872.849736
Step: 325	loss: 872.845569
Step: 326	loss: 872.826084
Step: 327	los

## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [15]:
estimator.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 3.459502,  5.869586,  8.818344, ...,  9.101997,  7.861408,  6.944665],
       [-0.447363, -0.159992,  0.538026, ...,  0.336276,  0.183903,  0.351782],
       [ 0.663368,  0.178523, -0.150581, ...,  0.342808,  0.207144,  0.376436],
       [ 0.334399, -0.200837, -0.191472, ...,  0.388376, -0.373587, -0.510122],
       [-0.356504, -0.160303,  0.377781, ..., -0.429275, -0.298389,  0.101208]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
    feature_allzero    (features) bool False False False False False False ...
  * features           (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

In [16]:
estimator.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 2.239549,  1.369592,  2.05534 , ...,  1.12565 ,  2.352958,  2.034455],
       [ 0.006636,  0.145781, -0.060395, ...,  0.463318,  0.317246, -0.464744],
       [ 0.278803,  0.754612,  0.341749, ..., -0.430314,  0.52776 ,  0.419028],
       [ 0.612083,  0.302963,  0.403214, ..., -0.22296 ,  0.194152,  0.457432],
       [ 0.318217,  0.142252,  0.34286 , ...,  0.518943,  0.036133,  0.461486]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
    feature_allzero      (features) bool False False False False False False ...
  * features             (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

## Comparing the results with the simulated data:

Individual coefficients:

In [17]:
# Root mean squared deviation between fitted and simulated coefficients,
# reported separately for the location and the scale model.
rmsd = glm.utils.stats.rmsd

locdiff = rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

scalediff = rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.08


Linear model output (design matrix multiplied by the coefficients):

In [18]:
# Compare the linear predictors (design matrix times coefficients) of the
# fitted and the simulated model, instead of the raw coefficients.
fitted_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

fitted_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.07
