# Runtime Comparisons on Real Data

This notebook checks the performance of feature batching vs standard training. It makes use of data from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5746044/.

In [1]:
import os
import datetime
import numpy as np
import pprint
import matplotlib.pyplot as plt
import logging
import warnings
import scipy
import time
import pandas as pd

%load_ext autoreload
%autoreload 2

In [20]:
import batchglm.api as glm
import scanpy as sc

np.warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")
logging.getLogger("tensorflow").setLevel(logging.INFO)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

## Setup

### I/O Settings

In [3]:
in_dir = '/home/mario/PSC19/'
out_dir = '/home/mario/PSC19/'
matrix_file = 'riesenfeld17_il25nmu_counts.mtx'
annot_file = 'riesenfeld17_il25nmu_annot.csv'
genes_file = 'riesenfeld17_il25nmu_genes.csv'

### Parameters

In [4]:
plot_figures = True  # specify whether figures should be printed.
save_figures = False # specify whether figures should be saved.

quick_scale = True  # determine whether the variance model should be trained
autograd = False     # specfify whether to use closed form gradients or TensorFlow's built-in autograd

## Generate Input Data for BatchGLM

### Read in Counts Data

scipy's mmread function takes a while here, approx. 90 seconds.

In [5]:
x_orig = scipy.io.mmread(in_dir+matrix_file).tocsc()
x = x_orig.transpose().toarray()
x = x.astype(dtype='int')
print(x.shape)

(35670, 13711)


We take three samples with the highest 10000, 5000 and 1000 expressed genes:

In [6]:
counts = np.sum(x, axis=0)
counts_sorted = counts.argsort()
highest_10000 = counts_sorted[-10000:]
highest_5000 = counts_sorted[-5000:]
highest_1000 = counts_sorted[-1000:]

x_10000 = x[:,highest_10000]
x_5000 = x[:,highest_5000]
x_1000 = x[:,highest_1000]
print(x_10000.shape, x_5000.shape, x_1000.shape)

(35670, 10000) (35670, 5000) (35670, 1000)


### Downsampling

35670 observations are way too much to do general runtimes analyses at this point in time. We downsample to `n_obs` observations by taking random row indices from x.

In [7]:
n_obs = 10000
idx = np.random.choice(x.shape[0], n_obs, replace=False)
x = x[idx]
x_10000 = x_10000[idx]
x_5000 = x_5000[idx]
x_1000 = x_1000[idx]
print(x_10000.shape, x_5000.shape, x_1000.shape)

(10000, 10000) (10000, 5000) (10000, 1000)


### Create Design matrices

Read annotation csv data with pandas

In [8]:
annot_data = pd.read_csv(in_dir+annot_file)
annot_data= annot_data.loc[idx]
annot_data.head()

Unnamed: 0.1,Unnamed: 0,cell,sample,cond,rep
30907,NMU_rep1.TTCGGTCTCAGTGCAT,NMU_rep1.TTCGGTCTCAGTGCAT,NMU_rep1,NMU,rep1
27531,NMU_rep1.CATTATCCACATCCGG,NMU_rep1.CATTATCCACATCCGG,NMU_rep1,NMU,rep1
24888,NMU_IL25_rep2.GTGGGTCAGTTAAGTG,NMU_IL25_rep2.GTGGGTCAGTTAAGTG,NMU_IL25_rep2,NMU_IL25,rep2
31030,NMU_rep1.TTGTAGGTCGCGTAGC,NMU_rep1.TTGTAGGTCGCGTAGC,NMU_rep1,NMU,rep1
13180,control_rep2.CAGATCACAGTAGAGC,control_rep2.CAGATCACAGTAGAGC,control_rep2,control,rep2


In [9]:
annot_data.shape

(10000, 5)

In [10]:
annot_data['sample'].unique()

array(['NMU_rep1', 'NMU_IL25_rep2', 'control_rep2', 'control_rep1',
       'NMU_IL25_rep1', 'NMU_rep2', 'IL25_rep1', 'IL25_rep2'],
      dtype=object)

In [11]:
annot_data['cond'].unique()

array(['NMU', 'NMU_IL25', 'control', 'IL25'], dtype=object)

Data consists of a total of `n_obs` observations, with 4 conditions, each of which is given in two batches.

In [12]:
intercept = np.ones(n_obs)
cond_IL25 = np.array(annot_data['cond'] == 'IL25', dtype='int')
cond_NMU = np.array(annot_data['cond'] == 'NMU', dtype='int')
cond_NMU_IL25 = np.array(annot_data['cond'] == 'NMU_IL25', dtype='int')
#batch = np.array(annot_data['rep'] == 'rep2', dtype='int')

The design matrix can be given by an intercept, representing condition control and batch 0 + 3 condition columns and a batch column.
It has the shape n_obs x (intercept, IL25, NMU, IL25_NMU, batch)

In [13]:
design_loc = np.stack((intercept, cond_IL25, cond_NMU, cond_NMU_IL25), axis=-1)
print(design_loc)
print(design_loc.shape)


[[1. 0. 1. 0.]
 [1. 0. 1. 0.]
 [1. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 1.]
 [1. 0. 1. 0.]]
(10000, 4)


`design_loc` and `design_scale` are the same in this case, so we just assign it here: 

In [14]:
design_scale = design_loc

### Create Size Factors

In [15]:
size_factors_10000 = np.sum(x_10000, axis=1)/(n_obs)
size_factors_10000 = size_factors_10000 / np.mean(size_factors_10000)
size_factors_5000 = np.sum(x_5000, axis=1)/(n_obs)
size_factors_5000 = size_factors_5000 / np.mean(size_factors_5000)
size_factors_1000 = np.sum(x_1000, axis=1)/(n_obs)
size_factors_1000 = size_factors_1000 / np.mean(size_factors_1000)
print(size_factors_1000.shape)

(10000,)


Check for zeros to identify potential columns with only zeros (features without any counts):

In [16]:
non_zero_idx = np.where(size_factors > 0)[0]
print(non_zero_idx)
print(x.shape)
if len(non_zero_idx) < len(size_factors):
    x = x[:,non_zero_idx]
    size_factors = size_factors[non_zero_idx]

NameError: name 'size_factors' is not defined

Empty array, thus all features have at least 1 count. 

### Create Input Data Objects

In [21]:
input_data_10000 = glm.models.glm_nb.InputDataGLM(data=x_10000, design_loc=design_loc, design_scale=design_scale, size_factors=size_factors_10000)
input_data_5000 = glm.models.glm_nb.InputDataGLM(data=x_5000, design_loc=design_loc, design_scale=design_scale, size_factors=size_factors_5000)
input_data_1000 = glm.models.glm_nb.InputDataGLM(data=x_1000, design_loc=design_loc, design_scale=design_scale, size_factors=size_factors_1000)

## Fit Models with IRLS

In [22]:
times_standard_irls = np.zeros(3)
times_fb_irls = np.zeros(3)
for i, input_data in enumerate([input_data_1000, input_data_5000, input_data_10000]):
    
    estimator = glm.models.glm_nb.Estimator(input_data, init_a = "standard", init_b = "standard", quick_scale=quick_scale)
    estimator.initialize()
    
    t0_analytic = time.time()
    estimator.train(batched_model=False, batch_size=1000, optimizer="irls_gd_tr", learning_rate=1e-2, 
                convergence_criteria="all_converged", stopping_criteria=30, autograd=autograd, featurewise=False)
    t1_analytic = time.time()
    times_standard_irls[i] = t1_analytic - t0_analytic
    
    del estimator
    estimator2 = glm.models.glm_nb.Estimator(input_data, 
                init_a = "standard", init_b = "standard", quick_scale=quick_scale)
    estimator2.initialize()
    
    t0_tf = time.time()
    estimator2.train(batched_model=False, batch_size=1000, optimizer="irls_gd_tr", learning_rate=1e-2,
                convergence_criteria="all_converged", stopping_criteria=30, autograd=autograd, featurewise=True)
    t1_tf = time.time()
    times_fb_irls[i] = t1_tf - t0_tf
    del estimator2
print('standard', times_standard_irls)
print('featurewise', times_fb_irls)

(10000,)
step 0
Step: 3 loss: 17037.118601371214, converged 162, updated 1000, (logs: 162, grad: 0, x_step: 0)
Step: 4 loss: 17037.031380757515, converged 799, updated 838, (logs: 637, grad: 0, x_step: 0)
Step: 5 loss: 17037.026009157198, converged 971, updated 201, (logs: 172, grad: 0, x_step: 0)
Step: 6 loss: 17037.02595248672, converged 996, updated 29, (logs: 25, grad: 0, x_step: 0)
Step: 7 loss: 17037.025952475637, converged 998, updated 4, (logs: 2, grad: 0, x_step: 0)
Step: 8 loss: 17037.025952475607, converged 1000, updated 2, (logs: 2, grad: 0, x_step: 0)
(10000,)
step 0
Step: 3 loss: 17037.118601371214, converged 162, updated 1000, (logs: 162, grad: 0, x_step: 0)
Step: 4 loss: 17037.031380757515, converged 799, updated 838, (logs: 637, grad: 0, x_step: 0)
Step: 5 loss: 17037.02600915722, converged 971, updated 201, (logs: 172, grad: 0, x_step: 0)
Step: 6 loss: 17037.025952486765, converged 996, updated 29, (logs: 25, grad: 0, x_step: 0)
Step: 7 loss: 17037.025952475717, conve

## Fit Models with ADAM

ATTENTION: Fitting with ADAM takes considerably longer than using IRLS. Runtimes for 10000 obs and 10000 features are expected to run many hours, probably longer than one day (tested with Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz using 8 parallel computations)

In [None]:
times_standard_adam = np.zeros(3)
times_fb_adam = np.zeros(3)
for i, input_data in enumerate([input_data_1000]):
    
    estimator = glm.models.glm_nb.Estimator(input_data, init_a = "standard", init_b = "standard", quick_scale=quick_scale)
    estimator.initialize()
    
    t0_analytic = time.time()
    estimator.train(batched_model=False, batch_size=1000, optimizer="adam", learning_rate=1e-2, 
                convergence_criteria="all_converged", stopping_criteria=30, autograd=autograd, featurewise=False)
    t1_analytic = time.time()
    times_standard_adam[i] = t1_analytic - t0_analytic
    
    del estimator
    estimator2 = glm.models.glm_nb.Estimator(input_data, 
                init_a = "standard", init_b = "standard", quick_scale=quick_scale)
    estimator.initialize()
    
    t0_tf = time.time()
    estimator2.train(batched_model=False, batch_size=1000, optimizer="adam", learning_rate=1e-2,
                convergence_criteria="all_converged", stopping_criteria=30, autograd=autograd, featurewise=True)
    t1_tf = time.time()
    times_fb_adam[i] = t1_tf - t0_tf
    del estimator2
print('standard', times_standard_adam)
print('featurewise', times_fb_adam)

In [None]:
if plot_figures:# data to plot
    n_groups = 3

    # create plot
    fig, ax = plt.subplots(figsize=(3.2,4.4))
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.8

    rects1 = plt.bar(index, times_standard_irls, bar_width,
    alpha=opacity,
    color='b',
    label='standard')

    rects2 = plt.bar(index + bar_width, times_fb_irls, bar_width,
    alpha=opacity,
    color='g',
    label='featurewise')

    plt.xlabel('number of observations/features')
    plt.ylabel('runtime in sec')
    #plt.title('Scores by person')
    plt.xticks(index + bar_width, ('20000/10000', '10000/5000', '2000/2000'))
    plt.legend()

    if save_figure:
        plt.tight_layout()
        plt.savefig('/home/mario/PSC19/batchglm_tf2/irls_runtimes.pdf', bbox_inches="tight")
    plt.show()

In [None]:
if plot_figures:
    
    # data to plot
    n_groups = 3

    # create plot
    fig, ax = plt.subplots(figsize=(3.2,4.4))
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.8

    rects1 = plt.bar(index, times_standard_adam, bar_width,
    alpha=opacity,
    color='b',
    label='standard')

    rects2 = plt.bar(index + bar_width, times_fb_adam, bar_width,
    alpha=opacity,
    color='g',
    label='featurewise')

    plt.xlabel('number of observations/features')
    plt.ylabel('runtime in sec')
    #plt.title('Scores by person')
    plt.xticks(index + bar_width, ('20000/10000', '10000/5000', '2000/2000'))
    plt.legend()


    if save_figures:
        plt.tight_layout()
        plt.savefig(out_dir+'adam_runtimes.pdf', bbox_inches="tight")

    plt.show()