In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

# Reload imported modules automatically before each cell is executed, so that
# edits to library source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Import batchglm

In [2]:
import batchglm.api as glm

np.warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")
logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

## Simulate some data

In [3]:
# Seed NumPy's global random number generator so that the simulated data set
# (and therefore every output below) is reproducible across kernel restarts.
# NOTE(review): assumes the simulator draws from `np.random` -- confirm.
np.random.seed(0)

# Simulate a data set with 100 features; `generate()` creates the data in place.
sim = glm.models.glm_nb.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [4]:
# Simulated data matrix; the recorded output has dimensions (observations, features).
sim.X

<xarray.DataArray 'X' (observations: 1000, features: 100)>
array([[1310,  783,   35, ...,  229,  217,  283],
       [1056,  589,  368, ...,   16,  101,   39],
       [ 620,  734,  546, ...,    0,  536,  101],
       ...,
       [1126, 1921,  738, ...,  134,    1,  125],
       [ 828, 1255,  650, ...,   79,   82,  236],
       [1520,  861, 1182, ...,  112,   21,  404]])
Dimensions without coordinates: observations, features

In [5]:
# Distinct rows of the location design matrix, i.e. every covariate
# combination that occurs among the simulated observations.
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

In [6]:
# Distinct rows of the scale design matrix, i.e. every covariate
# combination that occurs among the simulated observations.
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [7]:
# Location-model coefficients used by the simulator: one row per column of the
# location design matrix, one column per feature (see the recorded output).
sim.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 6.306275e+00,  6.244167e+00,  6.214608e+00, ...,  6.267201e+00,
         6.200509e+00,  6.282267e+00],
       [ 4.150898e-01,  2.989900e-01, -8.960187e-02, ...,  3.348734e-01,
         3.808046e-01, -3.055372e-01],
       [ 5.226297e-01, -1.717118e-01,  4.361976e-01, ..., -8.599905e-03,
         3.547844e-01,  2.666716e-01],
       [-3.557619e-03,  1.538305e-01,  2.388673e-01, ...,  6.926696e-01,
         1.445147e-01,  1.726720e-01],
       [ 8.409922e-02,  5.777250e-01,  5.055840e-01, ..., -5.594666e-01,
         4.193475e-01,  3.187674e-01]])
Coordinates:
  * design_loc_params  (design_loc_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

In [8]:
# Scale-model coefficients used by the simulator: one row per column of the
# scale design matrix, one column per feature (see the recorded output).
sim.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 0.210112,  0.543395,  0.617367, ..., -0.412905, -0.451211,  0.58719 ],
       [ 0.571239,  0.453361,  0.271927, ..., -0.112064, -0.266357, -0.222001],
       [ 0.409294,  0.532595,  0.168298, ...,  0.656426,  0.25236 , -0.152015],
       [-0.043578, -0.307193,  0.464835, ...,  0.185886,  0.440739,  0.525042],
       [ 0.646874, -0.569399,  0.68175 , ...,  0.065997,  0.175608,  0.495903]])
Coordinates:
  * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

## Estimate the model

In [9]:
# Pull the data matrix and both design matrices out of the simulator.
X, design_loc, design_scale = sim.X, sim.design_loc, sim.design_scale

# Bundle them into the input container that is handed to the estimator.
input_data = glm.models.glm_nb.InputData.new(
    data=X,
    design_loc=design_loc,
    design_scale=design_scale,
)

### Set up the estimator:

In [10]:
# Create the estimator for the prepared input data and initialize it;
# initialization must happen before any training call.
# (The log output recorded below comes from TensorFlow.)
estimator = glm.models.glm_nb.Estimator(input_data)
estimator.initialize()

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Now train:

There are multiple possible training strategies:

In [11]:
# Print the name of every available training strategy, one per line.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

Each of them corresponds to a list of training options that are passed to the `estimator.train()` function.

**Note: the example below is not correct right now.**

Accordingly, when choosing the training strategy "DEFAULT", the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which in turn is equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'GD',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [12]:
# "AUTO" leaves the choice of training strategy to the estimator itself.
estimator.train_sequence("AUTO")

INFO:tensorflow:Step: 0 loss: 735.288950 models converged 0
INFO:tensorflow:Step: 1 loss: 731.097275, converged 0 in 0.938 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 2 loss: 730.722463, converged 0 in 0.427 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 3 loss: 730.716881, converged 6 in 0.42 sec., updated 100, {f: 6, g: 1, x: 0}
INFO:tensorflow:Step: 4 loss: 730.716875, converged 91 in 0.414 sec., updated 91, {f: 82, g: 74, x: 50}
INFO:tensorflow:Step: 5 loss: 730.716875, converged 100 in 0.421 sec., updated 9, {f: 9, g: 9, x: 9}


## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [13]:
# Fitted location-model coefficients, the counterpart of `sim.par_link_loc`;
# the two are compared in the cells further down.
# (The previous accessor `estimator.a_var` failed here with
# "ValueError: cannot add coordinates with new dimensions to a DataArray".)
estimator.par_link_loc

In [None]:
# Fitted scale-model coefficients, the counterpart of `sim.par_link_scale`;
# the two are compared in the cells further down.
estimator.par_link_scale

## Comparing the results with the simulated data:

Individual coefficients:

In [None]:
# Coefficient-level agreement between the fit and the simulation:
# root mean squared deviation of the location coefficients ...
locdiff = glm.utils.stats.rmsd(
    estimator.par_link_loc,
    sim.par_link_loc,
)
print("Root mean squared deviation of location: %.2f" % locdiff)

# ... and of the scale coefficients.
scalediff = glm.utils.stats.rmsd(
    estimator.par_link_scale,
    sim.par_link_scale,
)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

Linear model output:

In [None]:
# Compare linear predictors (design matrix times coefficients) instead of the
# raw coefficients, first for the location model ...
fitted_loc = np.matmul(estimator.design_loc, estimator.par_link_loc)
simulated_loc = np.matmul(sim.design_loc, sim.par_link_loc)
locdiff = glm.utils.stats.rmsd(fitted_loc, simulated_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# ... then for the scale model.
fitted_scale = np.matmul(estimator.design_scale, estimator.par_link_scale)
simulated_scale = np.matmul(sim.design_scale, sim.par_link_scale)
scalediff = glm.utils.stats.rmsd(fitted_scale, simulated_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)