In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

%load_ext autoreload
%autoreload 2

## Import batchglm

In [2]:
import batchglm.api as glm

np.warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")
logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

## Simulate some data

In [3]:
sim = glm.models.glm_nb.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [4]:
sim.X

<xarray.DataArray 'X' (observations: 1000, features: 100)>
array([[ 242,  829,  148, ...,  623,  370,   24],
       [ 559,  401, 1094, ...,  263, 1321,  786],
       [ 324,  568,   91, ...,  303,  231,   18],
       ...,
       [3194,  526,  384, ...,  265,   22,  557],
       [ 190,   16,  150, ...,   11,  241,  247],
       [7682,  626,   79, ...,  120,  178,   39]])
Dimensions without coordinates: observations, features

In [5]:
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

In [6]:
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [7]:
sim.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 6.248043,  6.249975,  6.173786, ...,  6.25575 ,  6.156979,  6.216606],
       [ 0.274754,  0.594328,  0.222326, ...,  0.030784,  0.425086,  0.681651],
       [ 0.171424,  0.619947,  0.458313, ...,  0.543494,  0.30293 , -0.37049 ],
       [ 0.628701,  0.016404, -0.309194, ...,  0.499802, -0.473083,  0.341887],
       [ 0.635263, -0.475037, -0.207962, ..., -0.484673, -0.603593, -0.341515]])
Coordinates:
  * design_loc_params  (design_loc_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

In [8]:
sim.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 0.509944,  0.626992, -0.539249, ...,  0.556744, -0.217805,  0.176009],
       [ 0.252749,  0.455345,  0.577451, ...,  0.657897,  0.487069, -0.441436],
       [-0.464174,  0.357717, -0.105022, ...,  0.131842,  0.487984, -0.186867],
       [ 0.676445, -0.123315,  0.581966, ...,  0.590098,  0.208276,  0.159575],
       [-0.669824,  0.368721,  0.431654, ...,  0.271198, -0.065935,  0.548516]])
Coordinates:
  * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

## Estimate the model

In [9]:
X = sim.X
design_loc = sim.design_loc
design_scale = sim.design_scale

# input data
input_data = glm.models.glm_nb.InputData.new(data=X, design_loc=design_loc, design_scale=design_scale)

### set up estimator:

In [10]:
estimator = glm.models.glm_nb.Estimator(input_data)
estimator.initialize()

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Now train:

There are multiple possible training strategies:

In [11]:
for i in estimator.TrainingStrategy:
    print(i.name)

Each one of them corresponds to a list of training options which will be passed to the estimator.train() function:

This is not correct right now:
Therefore, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equal to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
is equal to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'GD',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator choose automatically the best training strategy:

In [12]:
estimator.train_sequence("AUTO")

INFO:tensorflow:Step: 0 loss: 731.020506 models converged 0
INFO:tensorflow:Step: 1 loss: 726.528533, converged 0 in 0.955 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 2 loss: 726.095678, converged 0 in 0.439 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 3 loss: 726.089434, converged 7 in 0.461 sec., updated 100, {f: 7, g: 3, x: 0}
INFO:tensorflow:Step: 4 loss: 726.089429, converged 92 in 0.412 sec., updated 90, {f: 82, g: 70, x: 41}
INFO:tensorflow:Step: 5 loss: 726.089429, converged 100 in 0.418 sec., updated 8, {f: 8, g: 8, x: 8}


## Obtaining the results

The fitted parameters can be retrieved by calling the corresponding parameters of `estimator`:

In [13]:
estimator.a_var

ValueError: cannot add coordinates with new dimensions to a DataArray

In [None]:
estimator.par_link_scale

## Comparing the results with the simulated data:

Individual coefficients:

In [None]:
locdiff = glm.utils.stats.rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

scalediff = glm.utils.stats.rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Linear model output:

In [None]:
locdiff = glm.utils.stats.rmsd(np.matmul(estimator.design_loc, estimator.par_link_loc), 
                               np.matmul(sim.design_loc, sim.par_link_loc))
print("Root mean squared deviation of location: %.2f" % locdiff)

scalediff = glm.utils.stats.rmsd(np.matmul(estimator.design_scale, estimator.par_link_scale), 
                                 np.matmul(sim.design_scale, sim.par_link_scale))
print("Root mean squared deviation of scale:    %.2f" % scalediff)