In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

# Let both libraries emit log records at INFO level and above.
for _logger_name in ("tensorflow", "batchglm"):
    logging.getLogger(_logger_name).setLevel(logging.INFO)

## Import batchglm

In [2]:
import batchglm.api as glm

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# just to ignore some tensorflow warnings; just ignore this line
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

## Simulate some data

In [4]:
# Create a simulator for 100 features and generate a synthetic data set.
# NOTE(review): no random seed is set anywhere in this notebook, so a re-run
# will presumably not reproduce the recorded values — confirm how the
# Simulator draws its random numbers and seed it if reproducibility matters.
sim = glm.models.nb_glm.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [5]:
# Display the simulated data matrix (observations x features).
sim.X

<xarray.DataArray 'X' (observations: 2000, features: 100)>
array([[ 1608, 23855,  3128, ...,  9093,  6313, 14811],
       [ 1311, 12293,   423, ...,  4980,  4481,  3952],
       [ 1532,  7536,  1744, ...,  5113,  1503,  4223],
       ...,
       [ 1076,  6685,  1879, ...,  5888, 13468,  3888],
       [ 1120, 54190,  4770, ...,  3767,  1320,  2426],
       [ 3860, 62971,  6525, ...,  7628, 11660, 10329]])
Dimensions without coordinates: observations, features

In [6]:
# Show the distinct rows of the location design matrix.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.]])

In [7]:
# Show the distinct rows of the scale design matrix.
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.]])

### The parameters used to generate this data:

In [8]:
# Location parameters used by the simulator (design_loc_params x features).
sim.par_link_loc

<xarray.DataArray 'a' (design_loc_params: 5, features: 100)>
array([[ 6.855062,  9.138636,  8.353598, ...,  8.767831,  8.6532  ,  8.658673],
       [ 0.318947,  0.119078,  0.019966, ...,  0.624813,  0.199808,  0.667541],
       [-0.412925,  0.546072,  0.328992, ..., -0.195601,  0.233483, -0.400711],
       [ 0.272392,  0.486437,  0.570515, ...,  0.408122,  0.444424,  0.091396],
       [ 0.664643,  0.347944, -0.088663, ..., -0.372122,  0.03084 ,  0.398537]])
Coordinates:
  * design_loc_params  (design_loc_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

In [9]:
# Scale parameters used by the simulator (design_scale_params x features).
sim.par_link_scale

<xarray.DataArray 'b' (design_scale_params: 5, features: 100)>
array([[ 6.931472e-01,  0.000000e+00,  0.000000e+00, ...,  2.079442e+00,
         1.098612e+00,  6.931472e-01],
       [-3.862667e-01, -3.841125e-01,  3.679323e-01, ...,  6.431932e-01,
         3.522942e-01,  3.748949e-01],
       [ 2.751912e-01,  5.954063e-01,  5.339719e-01, ...,  6.800600e-01,
         5.530371e-01, -5.404328e-02],
       [ 3.720953e-04, -6.677351e-01, -1.369281e-01, ...,  4.281579e-01,
         5.010404e-01,  3.716122e-01],
       [ 3.143021e-01,  5.829746e-01,  6.737592e-01, ...,  2.378922e-01,
         4.735500e-01,  9.865651e-02]])
Coordinates:
  * design_scale_params  (design_scale_params) <U14 'Intercept' 'batch[T.1]' ...
Dimensions without coordinates: features

## Estimate the model

In [10]:
# Pull the simulated data matrix and both design matrices out of the simulator.
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# Bundle data and design matrices into the input object consumed by the estimator.
input_data = glm.models.nb_glm.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

### set up estimator:

In [11]:
# Build the estimator for the prepared input data and initialize it.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'InputData' object has no attribute '_constraints_loc'",
# and none of the following cells have recorded output. Presumably the
# installed batchglm version does not match the one this notebook was written
# for — confirm and re-run before relying on the sections below.
estimator = glm.models.nb_glm.Estimator(input_data)
estimator.initialize()

AttributeError: 'InputData' object has no attribute '_constraints_loc'

### Now train:

There are multiple possible training strategies:

In [None]:
# Print the name of every available training strategy, one per line.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

Each strategy is a list of training option sets; every entry in the list is passed to one call of the `estimator.train()` function:

In [None]:
# Pretty-print the training options stored in the DEFAULT strategy.
pprint.pprint(estimator.TrainingStrategy.DEFAULT.value)

Therefore, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which in turn is equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'GD',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [None]:
# Run the training sequence with the "AUTO" strategy selection.
estimator.train_sequence("AUTO")

## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [None]:
# Fitted location parameters.
estimator.par_link_loc

In [None]:
# Fitted scale parameters.
estimator.par_link_scale

## Comparing the results with the simulated data:

In [None]:
# Measure how far the fitted parameters are from the simulated ones using the
# root mean squared deviation helper from batchglm.
rmsd = glm.utils.stats.rmsd

locdiff = rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

scalediff = rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)