In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

# Per-library log verbosity: INFO for the "tensorflow" logger, DEBUG for "batchglm".
for _logger_name, _level in (("tensorflow", logging.INFO), ("batchglm", logging.DEBUG)):
    logging.getLogger(_logger_name).setLevel(_level)

## Import batchglm

In [2]:
import batchglm.api as glm

In [3]:
# Hide DeprecationWarning messages raised from modules whose name starts with
# "tensorflow". This only reduces noise in the notebook output.
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

## Simulate some data

In [4]:
# Create a `glm_nb` simulator with 100 features and generate a synthetic data set.
# NOTE(review): no random seed is set here, so the simulated values may differ
# between runs — confirm whether the Simulator is seeded internally.
sim = glm.models.glm_nb.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [5]:
sim.X

<xarray.DataArray 'X' (observations: 1000, features: 100)>
array([[ 592,  446,  472, ...,  686,  166,  460],
       [1040,  436,  545, ..., 1999, 1014,  548],
       [ 355, 1246, 1169, ...,  362,  111,  260],
       ...,
       [ 351,  487, 2142, ...,   43,  664, 1656],
       [ 836,  397, 1643, ...,  339,  729, 1277],
       [ 406,  752, 1903, ...,  136, 1616,  538]])
Dimensions without coordinates: observations, features

In [6]:
# Show only the distinct rows of the location design matrix (axis=0 compares whole rows).
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

In [7]:
# Show only the distinct rows of the scale design matrix (axis=0 compares whole rows).
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [8]:
sim.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 6.236370e+00,  6.249975e+00,  6.285998e+00, ...,  6.228511e+00,
         6.257668e+00,  6.326149e+00],
       [ 2.049018e-01,  2.693441e-01,  6.121345e-01, ...,  5.915393e-01,
         6.811870e-01, -4.860269e-03],
       [ 3.234056e-01,  1.514454e-02,  4.739485e-02, ...,  1.575983e-01,
         3.081887e-01,  5.101046e-01],
       [ 1.487480e-02,  7.693484e-03, -3.710693e-01, ..., -9.910821e-02,
         6.230951e-01,  5.508496e-02],
       [ 2.107061e-01,  2.657196e-02,  5.290659e-01, ..., -6.421812e-01,
        -8.659752e-02,  6.675474e-01]])
Coordinates:
  * design_loc_params  (design_loc_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

In [9]:
sim.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 0.663148,  0.388968, -0.029529, ...,  0.533484,  0.523392,  0.330428],
       [ 0.430409,  0.641547,  0.270136, ..., -0.093337,  0.65314 ,  0.055318],
       [ 0.3044  ,  0.475201,  0.449178, ..., -0.258936,  0.582445, -0.088318],
       [ 0.487693,  0.435016,  0.588147, ...,  0.231171,  0.336771, -0.341528],
       [-0.576247, -0.144176,  0.43414 , ..., -0.504932,  0.433781,  0.677088]])
Coordinates:
  * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

## Estimate the model

In [10]:
# Take the simulated observations and both design matrices from the simulator.
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# Bundle them into the InputData object that is handed to the estimator.
input_data = glm.models.glm_nb.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

In [13]:
input_data

[batchglm.models.base_glm.input.InputData object at 0x7f0d4787c320]: data=<xarray.Dataset>
    Dimensions:              (design_loc_params: 5, design_scale_params: 5, features: 100, loc_params: 5, observations: 1000, scale_params: 5)
    Coordinates:
      * observations         (observations) int64 0 1 2 3 4 ... 995 996 997 998 999
        feature_allzero      (features) bool False False False ... False False False
      * features             (features) int64 0 1 2 3 4 5 6 ... 93 94 95 96 97 98 99
      * design_loc_params    (design_loc_params) object 'Intercept' ... 'batch[T.3]'
      * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
        loc_params           (design_loc_params) object 'Intercept' ... 'batch[T.3]'
        scale_params         (design_scale_params) object 'Intercept' ... 'batch[T.3]'
    Data variables:
        X                    (observations, features) int64 592 446 472 ... 1616 538
        design_loc           (observations, de

### set up estimator:

In [12]:
# Build the estimator from the prepared input data.
estimator = glm.models.glm_nb.Estimator(input_data)
# NOTE(review): the output saved for this cell is a TypeError, and the explicit
# initialize() call below is commented out — confirm whether it must be re-enabled.
#estimator.initialize()

TypeError: 'NoneType' object is not subscriptable

### Now train:

There are multiple possible training strategies:

In [12]:
# Print the name of every member of the estimator's TrainingStrategy enumeration.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

AUTO
DEFAULT
EXACT
QUICK
PRE_INITIALIZED


Each strategy corresponds to a list of option sets; each set is passed as keyword arguments to one call of the `estimator.train()` function:

In [13]:
pprint.pprint(estimator.TrainingStrategy.DEFAULT.value)

[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]


Therefore, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equal to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
is equal to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [14]:
estimator.train_sequence("AUTO")

training strategy:
[{'convergence_criteria': 't_test',
  'learning_rate': 0.1,
  'loss_window_size': 100,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': True},
 {'convergence_criteria': 't_test',
  'learning_rate': 0.05,
  'loss_window_size': 10,
  'optim_algo': 'ADAM',
  'stop_at_loss_change': 0.05,
  'use_batching': False}]
Beginning with training sequence #1
Step: 1	loss: 885.513913
Step: 2	loss: 898.976127
Step: 3	loss: 889.199063
Step: 4	loss: 890.597800
Step: 5	loss: 890.676163
Step: 6	loss: 890.348473
Step: 7	loss: 888.962466
Step: 8	loss: 888.290230
Step: 9	loss: 888.311687
Step: 10	loss: 889.301279
Step: 11	loss: 887.976396
Step: 12	loss: 887.533220
Step: 13	loss: 887.237896
Step: 14	loss: 887.413583
Step: 15	loss: 887.046304
Step: 16	loss: 886.387423
Step: 17	loss: 887.318333
Step: 18	loss: 887.345315
Step: 19	loss: 886.430518
Step: 20	loss: 885.893532
Step: 21	loss: 886.252933
Step: 22	loss: 887.000912
Step: 23	loss: 886.164973
Step: 24	loss: 885.51

Step: 294	loss: 885.970917
Step: 295	loss: 885.249642
Step: 296	loss: 885.652558
Step: 297	loss: 886.476882
Step: 298	loss: 885.302112
Step: 299	loss: 885.914032
Step: 300	loss: 886.021348
pval: 0.754031
Training sequence #1 complete
Beginning with training sequence #2
Step: 301	loss: 885.754197
Step: 302	loss: 888.480427
Step: 303	loss: 886.006512
Step: 304	loss: 886.239537
Step: 305	loss: 886.844229
Step: 306	loss: 886.370105
Step: 307	loss: 885.765668
Step: 308	loss: 885.759317
Step: 309	loss: 886.065760
Step: 310	loss: 886.104541
Step: 311	loss: 885.814132
Step: 312	loss: 885.549854
Step: 313	loss: 885.549775
Step: 314	loss: 885.693918
Step: 315	loss: 885.740031
Step: 316	loss: 885.641682
Step: 317	loss: 885.530796
Step: 318	loss: 885.506574
Step: 319	loss: 885.538244
Step: 320	loss: 885.544845
pval: 0.008181
Step: 321	loss: 885.503336
Step: 322	loss: 885.458179
Step: 323	loss: 885.447766
Step: 324	loss: 885.458214
Step: 325	loss: 885.457418
Step: 326	loss: 885.441182
Step: 327	los

Step: 580	loss: 885.341571
pval: 0.487771
Training sequence #2 complete


## Obtaining the results

The fitted parameters can be retrieved by calling the corresponding parameters of `estimator`:

In [15]:
estimator.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 8.799687,  8.118509,  8.956213, ...,  8.942947,  9.032373,  8.936549],
       [ 0.134017,  0.537621,  0.698733, ...,  0.445298,  0.089941,  0.196693],
       [-0.338728,  0.34282 ,  0.580924, ..., -0.5689  ,  0.572242,  0.673857],
       [ 0.663064,  0.685352,  0.225317, ...,  0.242009,  0.690999, -0.132891],
       [ 0.078703,  0.474199, -0.641201, ...,  0.035527,  0.154744, -0.284305]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
    feature_allzero    (features) bool False False False False False False ...
  * features           (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

In [16]:
estimator.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 2.159152, -0.021516,  1.802005, ...,  1.150372,  1.457607,  2.132369],
       [ 0.062396, -0.060727,  0.507236, ...,  0.408981, -0.545177,  0.453194],
       [ 0.205858,  0.271733,  0.537328, ..., -0.007023, -0.102717, -0.073561],
       [ 0.445841, -0.246   , -0.060423, ..., -0.507981, -0.412319,  0.505199],
       [-0.191027, -0.240782, -0.100181, ...,  0.192562,  0.01396 , -0.389601]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
    feature_allzero      (features) bool False False False False False False ...
  * features             (features) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ...

## Comparing the results with the simulated data:

Individual coefficients:

In [17]:
# Compare the fitted `par_link_*` coefficients with the ones used by the simulator.
locdiff = glm.utils.stats.rmsd(estimator.par_link_loc, sim.par_link_loc)
scalediff = glm.utils.stats.rmsd(estimator.par_link_scale, sim.par_link_scale)

# Report both deviations together.
print("Root mean squared deviation of location: %.2f" % locdiff)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.08


Linear model output:

In [18]:
# Location model: design matrix times coefficients, fitted versus simulated.
fitted_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: the same comparison with the scale design matrix and coefficients.
fitted_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Root mean squared deviation of location: 0.03
Root mean squared deviation of scale:    0.07
