In [1]:
import os
import datetime
import numpy as np
import pprint

import logging
import warnings

%load_ext autoreload
%autoreload 2

## Import batchglm

In [2]:
import batchglm.api as glm

np.warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")
logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

## Simulate some data

In [3]:
# Build a simulator for the glm_nb model with 100 features and generate a
# synthetic data set from it.
# NOTE(review): no random seed is set in the cells shown here, so the simulated
# values presumably change between runs — confirm how Simulator draws its
# random numbers before relying on the exact outputs displayed below.
sim = glm.models.glm_nb.Simulator(num_features=100)
sim.generate()

### Simulated model data:

In [4]:
sim.X

<xarray.DataArray 'X' (observations: 1000, features: 100)>
array([[ 389,  866,  817, ...,   21,  311,  507],
       [ 175,   79, 2425, ...,  695,  115, 1407],
       [ 512,  192,  706, ...,  638,  305,  391],
       ...,
       [ 217,  237,  752, ...,   31,   21, 1340],
       [ 411,  851, 1105, ...,  151, 1066,  326],
       [  35,  429, 1331, ...,  430,  527,  445]])
Dimensions without coordinates: observations, features

In [5]:
np.unique(sim.design_loc, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

In [6]:
np.unique(sim.design_scale, axis=0)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0.],
       [1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [1., 1., 1., 0., 0.]])

### The parameters used to generate this data:

In [7]:
sim.par_link_loc

<xarray.DataArray (design_loc_params: 5, features: 100)>
array([[ 6.222576,  6.226537,  6.20859 , ...,  6.204558,  6.150603,  6.240276],
       [-0.473377, -0.642484,  0.469219, ...,  0.201821, -0.653018,  0.511569],
       [-0.194697,  0.457161,  0.053068, ...,  0.533112, -0.02738 ,  0.58755 ],
       [ 0.093967,  0.524487,  0.640611, ...,  0.653832,  0.524259, -0.120332],
       [-0.235269,  0.249332, -0.299407, ...,  0.594625,  0.107086, -0.237646]])
Coordinates:
  * design_loc_params  (design_loc_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

In [8]:
sim.par_link_scale

<xarray.DataArray (design_scale_params: 5, features: 100)>
array([[ 0.655315,  0.378489,  0.038847, ..., -0.296621, -0.199702,  0.48456 ],
       [ 0.007845, -0.205828, -0.300329, ..., -0.622069, -0.254032, -0.486774],
       [ 0.431506, -0.057484,  0.671505, ..., -0.632874,  0.273627,  0.254541],
       [ 0.09804 ,  0.615998,  0.479533, ...,  0.684174,  0.367564,  0.555351],
       [-0.420474,  0.402405,  0.68731 , ..., -0.656742, -0.359149,  0.649688]])
Coordinates:
  * design_scale_params  (design_scale_params) object 'Intercept' ... 'batch[T.3]'
Dimensions without coordinates: features

## Estimate the model

In [9]:
# Take the simulated counts and both design matrices from the simulator, so
# the model is estimated on exactly the data generated earlier in the notebook.
X = sim.X
design_loc = sim.design_loc
design_scale = sim.design_scale

# Bundle the data with the location and scale design matrices into a glm_nb
# InputData object, which is what the estimator is constructed from.
input_data = glm.models.glm_nb.InputData.new(data=X, design_loc=design_loc, design_scale=design_scale)

### Set up the estimator:

In [10]:
# Create the glm_nb estimator for the prepared input data and initialize it.
# The INFO:tensorflow lines shown below this cell are logged while it runs.
estimator = glm.models.glm_nb.Estimator(input_data)
estimator.initialize()

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Now train:

There are multiple possible training strategies:

In [11]:
# Print the name of every training strategy the estimator offers, one per line.
for strategy in estimator.TrainingStrategy:
    print(strategy.name)

Each of them corresponds to a list of training options that are passed to the `estimator.train()` function.

**Note:** the following description is not correct right now.
When choosing the training strategy `"DEFAULT"`, the following call:
```python
estimator.train_sequence("DEFAULT")
```
is equivalent to:
```python
estimator.train_sequence(estimator.TrainingStrategy.DEFAULT)
```
which in turn is equivalent to:
```python
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.1,
    loss_window_size = 100,
    optim_algo = 'ADAM',
    stop_at_loss_change = 0.05,
    use_batching = True
)
estimator.train(
    convergence_criteria = 't_test',
    learning_rate = 0.05,
    loss_window_size = 10,
    optim_algo = 'GD',
    stop_at_loss_change = 0.05,
    use_batching = False
)
```

Now start the training sequence and let the estimator automatically choose the best training strategy:

In [19]:
estimator.train_sequence("AUTO")

INFO:tensorflow:Step: 0 loss: 733.262273 models converged 0
INFO:tensorflow:Step: 1 loss: 729.109345, converged 0 in 0.554 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 2 loss: 728.679206, converged 0 in 0.153 sec., updated 100, {f: 0, g: 0, x: 0}
INFO:tensorflow:Step: 3 loss: 728.671717, converged 3 in 0.147 sec., updated 100, {f: 3, g: 0, x: 0}
INFO:tensorflow:Step: 4 loss: 728.671706, converged 94 in 0.158 sec., updated 94, {f: 88, g: 69, x: 46}
INFO:tensorflow:Step: 5 loss: 728.671706, converged 100 in 0.151 sec., updated 6, {f: 6, g: 6, x: 3}


## Obtaining the results

The fitted parameters can be retrieved by accessing the corresponding attributes of `estimator`:

In [27]:
# Fitted location coefficients, read through the same `par_link_loc` accessor
# that the simulator exposes and that the comparison cells further down use.
estimator.par_link_loc

ValueError: cannot add coordinates with new dimensions to a DataArray

In [21]:
estimator.par_link_scale

ValueError: cannot add coordinates with new dimensions to a DataArray

## Comparing the results with the simulated data:

Individual coefficients:

In [22]:
# Root mean squared deviation between the fitted location coefficients and the
# coefficients the simulator used to generate the data.
locdiff = glm.utils.stats.rmsd(estimator.par_link_loc, sim.par_link_loc)
print("Root mean squared deviation of location: %.2f" % locdiff)

# Same comparison for the scale coefficients.
scalediff = glm.utils.stats.rmsd(estimator.par_link_scale, sim.par_link_scale)
print("Root mean squared deviation of scale:    %.2f" % scalediff)

ValueError: cannot add coordinates with new dimensions to a DataArray

Linear model output:

In [None]:
# Compare on the level of the linear model output instead of single
# coefficients: multiply each design matrix with its coefficient matrix and
# take the root mean squared deviation between the fitted and simulated result.
# NOTE(review): assumes `estimator` exposes `design_loc` / `design_scale`
# directly — this cell has no recorded output, so confirm before relying on it.
locdiff = glm.utils.stats.rmsd(np.matmul(estimator.design_loc, estimator.par_link_loc), 
                               np.matmul(sim.design_loc, sim.par_link_loc))
print("Root mean squared deviation of location: %.2f" % locdiff)

# Same comparison for the scale model.
scalediff = glm.utils.stats.rmsd(np.matmul(estimator.design_scale, estimator.par_link_scale), 
                                 np.matmul(sim.design_scale, sim.par_link_scale))
print("Root mean squared deviation of scale:    %.2f" % scalediff)