In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

%load_ext autoreload
%autoreload 2

## Import batchglm

In [2]:
import batchglm.api as glm

# Silence Python warnings for the rest of the notebook.
# `np.warnings` was only an accidental re-export of the standard-library `warnings`
# module and is no longer available in recent NumPy releases (it raises AttributeError),
# so the standard-library module is used directly.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

# Emit INFO-level messages (e.g. training progress) from the libraries used below.
logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

## Simulate some data

In [3]:
# Create a simulator for 100 features and generate its data; the results are read
# back from attributes of `sim` in the cells that follow.
sim = glm.models.glm_nb.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [4]:
# Simulated data matrix; displayed below with dimensions (observations, features).
sim.X

<xarray.DataArray 'X' (observations: 1000, features: 100)>
array([[1164,  451,  420, ...,  427,  340,  504],
       [ 304,  956,  519, ..., 2253,  199,   74],
       [ 654,  492,   41, ...,  534,  693, 1223],
       ...,
       [1095, 1012,  414, ...,  734,  104,  578],
       [  47, 1305,  401, ...,  717,  284,  641],
       [ 657, 4382,  159, ..., 3425,  868,  745]])
Dimensions without coordinates: observations, features

In [5]:
# Distinct rows of the location design matrix, i.e. every covariate combination that occurs.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

In [6]:
# Distinct rows of the scale design matrix, i.e. every covariate combination that occurs.
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [7]:
# Location-model parameters used by the simulator (design_loc_params x features).
sim.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 6.228511,  6.248043,  6.269096, ...,  6.244167,  6.107023,  6.122493],
       [ 0.303925,  0.568468, -0.320744, ...,  0.601764,  0.299382, -0.111811],
       [-0.234226,  0.559691, -0.583611, ...,  0.357968, -0.2249  ,  0.524813],
       [ 0.231819,  0.692746,  0.102646, ...,  0.113211,  0.070394, -0.534436],
       [ 0.58999 ,  0.201177, -0.35127 , ...,  0.370526, -0.38107 ,  0.284424]])
Coordinates:
  * design_loc_params  (design_loc_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

In [8]:
# Scale-model parameters used by the simulator (design_scale_params x features).
sim.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 0.300632,  0.474854,  0.510081, ...,  0.586243, -0.157439,  0.59703 ],
       [ 0.378329,  0.339236,  0.350916, ..., -0.11908 , -0.123917,  0.440371],
       [ 0.684067,  0.169802,  0.293268, ...,  0.232556,  0.073541,  0.315615],
       [ 0.118172, -0.324854, -0.309454, ..., -0.024326, -0.681732,  0.331696],
       [-0.492386,  0.223755,  0.396582, ..., -0.38257 ,  0.419458,  0.624573]])
Coordinates:
  * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

## Estimate the model

In [9]:
# Take the simulated data and both design matrices from the simulator ...
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# ... and bundle them into the input-data object handed to the estimator.
input_data = glm.models.glm_nb.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

### set up estimator:

In [10]:
# Build an estimator for the prepared input data and initialize it.
# The TensorFlow log lines shown below are emitted while this cell runs.
estimator = glm.models.glm_nb.Estimator(input_data)
estimator.initialize()

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Now train:

There are multiple possible training strategies:

In [11]:
# Print the name of every available training strategy, one per line.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

Each of them corresponds to a list of training options that are passed to the `estimator.train()` function:

**Note:** the expansion shown below is not correct right now; it is only meant to illustrate the idea.
For example, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which in turn is equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'GD',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [12]:
# "AUTO" leaves the choice of training strategy to the estimator; progress is logged per step.
estimator.train_sequence("AUTO")

INFO:tensorflow:Step: 0 loss: 734.853934 models converged 0
INFO:tensorflow:Step: 1 loss: 730.798488, converged 0 in 1.373 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 2 loss: 730.481257, converged 0 in 0.473 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 3 loss: 730.477827, converged 6 in 0.464 sec., updated 100, {f: 6, g: 1, x: 0}
INFO:tensorflow:Step: 4 loss: 730.477825, converged 92 in 0.641 sec., updated 93, {f: 85, g: 77, x: 48}
INFO:tensorflow:Step: 5 loss: 730.477825, converged 100 in 0.531 sec., updated 6, {f: 6, g: 6, x: 8}


## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [13]:
# NOTE(review): in the recorded run this access raised the ValueError shown below, and the
# following cells have no execution count. Later cells read `estimator.par_link_loc` for the
# location coefficients -- presumably the intended accessor here; confirm against the batchglm API.
estimator.a_var

ValueError: cannot add coordinates with new dimensions to a DataArray

In [None]:
# Fitted scale-model parameters; the estimator-side counterpart of `sim.par_link_scale`.
estimator.par_link_scale

## Comparing the results with the simulated data:

Individual coefficients:

In [None]:
# Root mean squared deviation between the fitted and the simulated location parameters.
locdiff = glm.utils.stats.rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Same comparison for the scale parameters.
scalediff = glm.utils.stats.rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Linear model output:

In [None]:
# Compare the linear predictors (design matrix times parameters) instead of the raw
# coefficients: fitted model first, simulated ground truth second.
fitted_linear_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_linear_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_linear_loc, simulated_linear_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Same comparison for the scale model.
fitted_linear_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_linear_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_linear_scale, simulated_linear_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)
print("Root mean squared deviation of scale:    %.2f" % scalediff)