In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

# Per-library log verbosity: INFO for tensorflow, DEBUG for batchglm.
for _logger_name, _log_level in (
    ("tensorflow", logging.INFO),
    ("batchglm", logging.DEBUG),
):
    logging.getLogger(_logger_name).setLevel(_log_level)

## Import batchglm

In [2]:
import batchglm.api as glm

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

## Simulate some data

In [4]:
# Create a simulator configured for 100 features.
# NOTE(review): no random seed is set in this notebook, so a re-run presumably
# produces different data than the outputs shown below - confirm whether the
# Simulator offers a way to fix the seed.
sim = glm.models.nb_glm.Simulator(num_features=100)
# Generate the simulated dataset.
sim.generate()

### Simulated model data:

In [5]:
# Display the simulated data array (observations x features, see output below).
sim.X

<xarray.DataArray 'X' (observations: 2000, features: 100)>
array([[ 8254,  3672,  7229, ...,  5280,  4970,  2834],
       [ 3572,  2617,  2694, ...,  7652, 14224,  7996],
       [ 5296,  1181,  7092, ..., 13695, 11411,  8449],
       ...,
       [18975, 27843,  1542, ..., 17310, 19900,  4840],
       [11797,  2158,  5167, ..., 14480, 12826,  6834],
       [31153,  2301,  1394, ..., 10736, 16736,  4827]])
Dimensions without coordinates: observations, features

In [6]:
# Show the distinct rows (np.unique along axis 0) of the simulator's design_loc matrix.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.]])

In [7]:
# Show the distinct rows (np.unique along axis 0) of the simulator's design_scale matrix.
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.]])

### The parameters used to generate this data:

In [8]:
# Simulated location parameters (design_loc_params x features, see output below).
sim.par_link_loc

<xarray.DataArray 'a' (design_loc_params: 5, features: 100)>
array([[ 8.806215,  8.083441,  8.958677, ...,  8.969354,  9.082994,  8.942818],
       [ 0.115385,  0.650169,  0.676358, ...,  0.417252,  0.072147,  0.18195 ],
       [-0.337574,  0.362238,  0.591211, ..., -0.56396 ,  0.536657,  0.626756],
       [ 0.6179  ,  0.681902,  0.228461, ...,  0.215373,  0.615605, -0.14967 ],
       [ 0.07716 ,  0.439423, -0.636067, ...,  0.017376,  0.134838, -0.273974]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

In [9]:
# Simulated scale parameters (design_scale_params x features, see output below).
sim.par_link_scale

<xarray.DataArray 'b' (design_scale_params: 5, features: 100)>
array([[ 2.197225,  0.      ,  1.791759, ...,  1.098612,  1.386294,  2.197225],
       [ 0.035024, -0.157254,  0.467061, ...,  0.54539 , -0.552892,  0.422423],
       [ 0.254288,  0.260412,  0.649432, ...,  0.105473, -0.117756, -0.043436],
       [ 0.453221, -0.189692, -0.017939, ..., -0.419644, -0.31896 ,  0.360347],
       [-0.277178, -0.259578, -0.107434, ...,  0.104903,  0.070054, -0.399334]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

## Estimate the model

In [10]:
# Take the simulated data and both design matrices from the simulator.
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# Bundle them into the input-data object that is handed to the estimator.
input_data = glm.models.nb_glm.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

### set up estimator:

In [11]:
# Create the estimator for the prepared input data.
estimator = glm.models.nb_glm.Estimator(input_data)
# Initialize the estimator; the log output below reports a closed-form MLE
# initialization for the mean.
estimator.initialize()

Using closed-form MLE initialization for mean
RMSE of closed-form mean:
[1.21777045e-04 1.51540194e-03 5.47320002e-04 3.70358114e-03
 1.16360710e-03 1.26262986e-02 2.13019045e-04 6.97138934e-03
 1.77815910e-03 2.26710994e-04 1.15609775e-03 2.51648449e-04
 1.41579701e-03 4.93907164e-04 1.18967855e-04 6.93461858e-04
 1.18602169e-03 2.10866936e-03 3.20415021e-03 3.57594707e-04
 2.75920184e-03 3.46467232e-04 3.16634623e-03 1.12906172e-03
 4.49988536e-03 7.72149491e-03 1.13673603e-03 1.33255063e-03
 1.08078349e-04 5.88724731e-03 1.55051888e-02 1.04069028e-03
 3.37859656e-04 2.23879080e-03 4.55242838e-04 8.41768549e-03
 1.28355095e-03 5.97037823e-04 3.61962280e-03 6.94334802e-04
 4.93019376e-04 4.61909142e-04 1.13183544e-02 2.10359005e-03
 1.56217164e-03 4.51902069e-04 1.04927533e-03 1.56825573e-03
 2.05945749e-03 1.45147367e-03 6.77560596e-04 2.13429478e-03
 5.05332923e-04 1.00493856e-03 1.70805258e-03 2.54676581e-03
 1.21753341e-03 5.14141976e-03 2.55480790e-03 7.94424344e-04
 1.27736547e-

### Now train:

There are multiple possible training strategies:

In [12]:
# Print the name of every member of the estimator's TrainingStrategy enumeration.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

AUTO
DEFAULT
EXACT
QUICK
PRE_INITIALIZED


Each of them corresponds to a list of training options, which will be passed to the `estimator.train()` function:

In [13]:
# Pretty-print the value stored for the DEFAULT strategy (a list of option dicts, see output below).
pprint.pprint(estimator.TrainingStrategy.DEFAULT.value)

[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]


Therefore, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which in turn is equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [14]:
# Run the training sequence with the "AUTO" strategy; the chosen strategy and
# the per-step loss are logged in the output below.
estimator.train_sequence("AUTO")

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]
Beginning with training sequence #1
Step: 1	loss: 885.513913
Step: 2	loss: 898.976127
Step: 3	loss: 889.199063
Step: 4	loss: 890.597800
Step: 5	loss: 890.676163
Step: 6	loss: 890.348473
Step: 7	loss: 888.962466
Step: 8	loss: 888.290230
Step: 9	loss: 888.311687
Step: 10	loss: 889.301279
Step: 11	loss: 887.976396
Step: 12	loss: 887.533220
Step: 13	loss: 887.237896
Step: 14	loss: 887.413583
Step: 15	loss: 887.046304
Step: 16	loss: 886.387423
Step: 17	loss: 887.318333
Step: 18	loss: 887.345315
Step: 19	loss: 886.430518
Step: 20	loss: 885.893532
Step: 21	loss: 886.252933
Step: 22	loss: 887.000912
Step: 23	loss: 886.164973
Step: 24	loss: 885.51

Step: 294	loss: 885.970917
Step: 295	loss: 885.249642
Step: 296	loss: 885.652558
Step: 297	loss: 886.476882
Step: 298	loss: 885.302112
Step: 299	loss: 885.914032
Step: 300	loss: 886.021348
pval: 0.754031
Training sequence #1 complete
Beginning with training sequence #2
Step: 301	loss: 885.754197
Step: 302	loss: 888.480427
Step: 303	loss: 886.006512
Step: 304	loss: 886.239537
Step: 305	loss: 886.844229
Step: 306	loss: 886.370105
Step: 307	loss: 885.765668
Step: 308	loss: 885.759317
Step: 309	loss: 886.065760
Step: 310	loss: 886.104541
Step: 311	loss: 885.814132
Step: 312	loss: 885.549854
Step: 313	loss: 885.549775
Step: 314	loss: 885.693918
Step: 315	loss: 885.740031
Step: 316	loss: 885.641682
Step: 317	loss: 885.530796
Step: 318	loss: 885.506574
Step: 319	loss: 885.538244
Step: 320	loss: 885.544845
pval: 0.008181
Step: 321	loss: 885.503336
Step: 322	loss: 885.458179
Step: 323	loss: 885.447766
Step: 324	loss: 885.458214
Step: 325	loss: 885.457418
Step: 326	loss: 885.441182
Step: 327	los

Step: 580	loss: 885.341571
pval: 0.487771
Training sequence #2 complete


## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [15]:
# Fitted location parameters (design_loc_params x features, see output below).
estimator.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 8.799687,  8.118509,  8.956213, ...,  8.942947,  9.032373,  8.936549],
       [ 0.134017,  0.537621,  0.698733, ...,  0.445298,  0.089941,  0.196693],
       [-0.338728,  0.34282 ,  0.580924, ..., -0.5689  ,  0.572242,  0.673857],
       [ 0.663064,  0.685352,  0.225317, ...,  0.242009,  0.690999, -0.132891],
       [ 0.078703,  0.474199, -0.641201, ...,  0.035527,  0.154744, -0.284305]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
    feature_allzero    (features) bool False False False False False False ...
  * features           (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

In [16]:
# Fitted scale parameters (design_scale_params x features, see output below).
estimator.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 2.159152, -0.021516,  1.802005, ...,  1.150372,  1.457607,  2.132369],
       [ 0.062396, -0.060727,  0.507236, ...,  0.408981, -0.545177,  0.453194],
       [ 0.205858,  0.271733,  0.537328, ..., -0.007023, -0.102717, -0.073561],
       [ 0.445841, -0.246   , -0.060423, ..., -0.507981, -0.412319,  0.505199],
       [-0.191027, -0.240782, -0.100181, ...,  0.192562,  0.01396 , -0.389601]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
    feature_allzero      (features) bool False False False False False False ...
  * features             (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

## Comparing the results with the simulated data:

Individual coefficients:

In [17]:
# Compare fitted and simulated coefficients via the root mean squared
# deviation, separately for the location and the scale parameters.
_rmsd = glm.utils.stats.rmsd

locdiff = _rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % (locdiff,))

scalediff = _rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % (scalediff,))

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.08


Linear model output:

In [18]:
# Compare the linear predictors (design matrix times parameters) of the fitted
# model against those of the simulation, for location and scale in turn.
fitted_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % (locdiff,))

fitted_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % (scalediff,))

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.07
