In [11]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import batchglm

In [12]:
import batchglm.api as glm

# Silence all Python warnings. `np.warnings` was merely the standard `warnings`
# module leaking through NumPy's namespace and is absent from recent NumPy
# releases, so the standard module is called directly (same effect).
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")
# Let INFO-level records from these loggers through (e.g. training progress).
logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

## Simulate some data

In [13]:
# Create a simulator for 100 features and let it generate a data set.
# NOTE(review): no random seed is set anywhere in this notebook, so the
# simulated values shown below presumably differ between runs — confirm and
# seed the RNG if reproducible output is required.
sim = glm.models.glm_nb.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [14]:
# Simulated data matrix; the recorded output shows an xarray.DataArray with
# dimensions (observations, features).
sim.X

<xarray.DataArray 'X' (observations: 1000, features: 100)>
array([[ 232,  554,  292, ...,  115,  303,  761],
       [ 548,  357,  381, ...,  364,  672,  670],
       [ 518,  122,  628, ...,  480,  273,  187],
       ...,
       [ 294,  245,  585, ...,  754, 2158,  766],
       [1411,   19,  306, ...,  188,  544,  584],
       [ 108,   30,  477, ...,  535, 1001,  844]])
Dimensions without coordinates: observations, features

In [15]:
# Distinct rows (axis=0) of the location design matrix.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

In [16]:
# Distinct rows (axis=0) of the scale design matrix.
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [17]:
# Location parameters held by the simulator; the recorded output has
# dimensions (design_loc_params, features).
sim.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 6.212606,  6.259581,  6.194405, ...,  6.214608,  6.202536,  6.126869],
       [ 0.008286, -0.66597 , -0.258192, ...,  0.250405,  0.592159,  0.492965],
       [ 0.237772,  0.496428, -0.083513, ...,  0.389086,  0.382781, -0.374124],
       [-0.361429,  0.110296, -0.619197, ...,  0.497508,  0.176477, -0.076254],
       [-0.485264,  0.482937,  0.451294, ..., -0.213904,  0.280149,  0.48521 ]])
Coordinates:
  * design_loc_params  (design_loc_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

In [18]:
# Scale parameters held by the simulator; the recorded output has
# dimensions (design_scale_params, features).
sim.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 0.628964,  0.380663, -0.180962, ...,  0.648771,  0.655523,  0.528559],
       [-0.475378, -0.065375,  0.649745, ...,  0.017356,  0.619811,  0.638862],
       [-0.045476,  0.019085,  0.525502, ..., -0.303837, -0.489012, -0.13816 ],
       [ 0.434237, -0.149537,  0.186041, ..., -0.120215,  0.439335, -0.632505],
       [-0.387223, -0.404339,  0.374715, ...,  0.584082,  0.346951,  0.249889]])
Coordinates:
  * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

## Estimate the model

In [19]:
# Take the simulated data and both design matrices from the simulator.
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# Bundle data and designs into the input object consumed by the estimator.
input_data = glm.models.glm_nb.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

### set up estimator:

In [20]:
# Construct the estimator from the prepared input data, then initialize it
# (the recorded log shows TensorFlow finalizing its graph at this point).
estimator = glm.models.glm_nb.Estimator(input_data)
estimator.initialize()

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Now train:

There are multiple possible training strategies:

In [21]:
# Print the name of every member of the estimator's TrainingStrategy enumeration.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

Each one of them corresponds to a list of training options which will be passed to the `estimator.train()` function:

**Note:** the following description is not correct right now.
Therefore, when choosing the training strategy `"DEFAULT"`, the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which in turn is equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'GD',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [22]:
# Run the training sequence registered under the name "AUTO"; progress is
# reported through the tensorflow logger at INFO level (see recorded output).
estimator.train_sequence("AUTO")

INFO:tensorflow:Step: 0 loss: 731.243285 models converged 0
INFO:tensorflow:Step: 1 loss: 727.064607, converged 0 in 0.932 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 2 loss: 726.678380, converged 0 in 0.412 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 3 loss: 726.672260, converged 12 in 0.435 sec., updated 100, {f: 12, g: 0, x: 0}
INFO:tensorflow:Step: 4 loss: 726.672252, converged 93 in 0.453 sec., updated 84, {f: 77, g: 69, x: 41}
INFO:tensorflow:Step: 5 loss: 726.672252, converged 100 in 0.458 sec., updated 6, {f: 6, g: 6, x: 5}


## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [23]:
# Fitted location parameters, shown via the same `par_link_*` accessor family
# that the scale cell and the comparison cells below use.
# NOTE(review): the previous `estimator.a_var` access raised
# "ValueError: cannot add coordinates with new dimensions to a DataArray" in
# the recorded run — confirm that `par_link_loc` displays cleanly.
estimator.par_link_loc

ValueError: cannot add coordinates with new dimensions to a DataArray

In [None]:
# Fitted scale parameters of the estimator (counterpart of sim.par_link_scale).
estimator.par_link_scale

## Comparing the results with the simulated data:

Individual coefficients:

In [None]:
# Compare fitted against simulated coefficients for the location model and
# report the deviation with two decimals.
locdiff = glm.utils.stats.rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Same comparison for the scale model.
scalediff = glm.utils.stats.rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Linear model output:

In [None]:
# Location model: compare design matrix times coefficients for the fit
# against the same product for the simulation.
locdiff = glm.utils.stats.rmsd(
    np.matmul(estimator.design_loc, estimator.par_link_loc),
    np.matmul(sim.design_loc, sim.par_link_loc),
)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Scale model: identical comparison using the scale design and coefficients.
scalediff = glm.utils.stats.rmsd(
    np.matmul(estimator.design_scale, estimator.par_link_scale),
    np.matmul(sim.design_scale, sim.par_link_scale),
)
print("Root mean squared deviation of scale:    %.2f" % scalediff)