diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py
index 74c9beb..236a209 100644
--- a/diffxpy/stats/stats.py
+++ b/diffxpy/stats/stats.py
@@ -2,6 +2,7 @@
 import numpy.linalg
 import scipy.sparse
 import scipy.stats
+import sys
 from typing import Union
 
 
@@ -263,7 +264,9 @@ def wald_test_chisq(
             raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries')
 
     theta_diff = theta_mle - theta0
-    wald_statistic = np.array([
+    invertible = np.where(np.linalg.cond(theta_covar, p=None) < 1 / sys.float_info.epsilon)[0]
+    wald_statistic = np.zeros([theta_covar.shape[0]]) + np.nan
+    wald_statistic[invertible] = np.array([
         np.matmul(
             np.matmul(
                 theta_diff[:, [i]].T,
@@ -271,8 +274,10 @@ def wald_test_chisq(
             ),
             theta_diff[:, [i]]
         )
-        for i in range(theta_diff.shape[1])
+        for i in invertible
     ]).flatten()
+    if invertible.shape[0] < theta_covar.shape[0]:
+        print("caught %i linalg singular matrix errors" % (theta_covar.shape[0] - invertible.shape[0]))
     pvals = 1 - scipy.stats.chi2(theta_mle.shape[0]).cdf(wald_statistic)
     return pvals
 
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
index 582d3b1..c378494 100644
--- a/diffxpy/testing/tests.py
+++ b/diffxpy/testing/tests.py
@@ -1026,7 +1026,7 @@ def two_sample(
         if noise_model is None:
             raise ValueError("Please specify noise_model")
         formula_loc = '~ 1 + grouping'
-        formula_scale = '~ 1 + grouping'
+        formula_scale = '~ 1'
         de_test = wald(
             data=data,
             factor_loc_totest="grouping",
@@ -1038,8 +1038,6 @@ def two_sample(
             sample_description=sample_description,
             noise_model=noise_model,
             size_factors=size_factors,
-            init_a="closed_form",
-            init_b="closed_form",
             batch_size=batch_size,
             backend=backend,
             train_args=train_args,
diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py
index 295f386..6763a23 100644
--- a/diffxpy/testing/utils.py
+++ b/diffxpy/testing/utils.py
@@ -271,3 +271,49 @@ def constraint_system_from_star(
         constraints=constraints,
         return_type=return_type
     )
+
+
+def bin_continuous_covariate(
+        factor_to_bin: str,
+        bins: Union[int, list, np.ndarray, Tuple],
+        data: Union[None, anndata.AnnData] = None,
+        sample_description: Union[None, pd.DataFrame] = None
+):
+    r"""
+    Bin a continuous covariate.
+
+    Adds the binned covariate to the table. If data is supplied, the covariate is added in place in data.obs, otherwise
+    the covariate is added in the sample_description and the new sample_description is returned.
+     Binning is performed on quantiles of the distribution.
+
+    :param factor_to_bin: Name of columns of factor to bin.
+    :param bins: Number of bins or iteratable with bin borders. If given as integer, the bins are defined on the
+        quantiles of the covariate, ie the bottom 20% of observations are in the first bin if bins==5.
+    :param data: Anndata object that contains sample description table in .obs.
+    :param sample_description: Sample description table.
+    :return: Sample description table with binned covariate added if sample_description was supplied, otherwise None is
+        returned as the new column was added in place.
+    """
+    if data is None and sample_description is not None:
+        sd = sample_description
+    elif data is not None and sample_description is None:
+        sd = data.obs
+    else:
+        raise ValueError("supply either data or sample_description")
+    if isinstance(bins, list) or isinstance(bins, np.ndarray) or isinstance(bins, Tuple):
+        bins = np.asarray(bins)
+    else:
+        bins = np.arange(0, 1, 1 / bins)
+
+    fac_binned = glm.data.bin_continuous_covariate(
+        sample_description=sd,
+        factor_to_bin=factor_to_bin,
+        bins=bins
+    )
+    if data is None and sample_description is not None:
+        sd[factor_to_bin + "_binned"] = fac_binned
+        return sample_description
+    elif data is not None and sample_description is None:
+        data.obs[factor_to_bin + "_binned"] = fac_binned
+    else:
+        assert False
diff --git a/diffxpy/unit_test/test_twosample.py b/diffxpy/unit_test/test_twosample.py
new file mode 100644
index 0000000..7cf357e
--- /dev/null
+++ b/diffxpy/unit_test/test_twosample.py
@@ -0,0 +1,168 @@
+import logging
+import unittest
+import numpy as np
+import pandas as pd
+import scipy.stats as stats
+
+import diffxpy.api as de
+
+
+class TestTwosample(unittest.TestCase):
+
+    def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
+        """
+        Test if de.test_wald_loc() generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
+        p-value distriubution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+        from batchglm.api.models.numpy.glm_nb import Simulator
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(n_groups, size=sim.nobs)
+        })
+
+        test = de.test.two_sample(
+            data=sim.input_data,
+            grouping=random_sample_description["condition"].values,
+            test="wald",
+            noise_model="nb",
+        )
+        summary = test.summary()
+
+        # Compare p-value distribution under null model against uniform distribution.
+        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
+
+        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
+        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
+
+        return True
+
+    def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
+        """
+        Test if de.test_wald_loc() generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
+        p-value distriubution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+        from batchglm.api.models.numpy.glm_nb import Simulator
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(n_groups, size=sim.nobs)
+        })
+
+        test = de.test.two_sample(
+            data=sim.input_data,
+            grouping=random_sample_description["condition"],
+            test="wald",
+            noise_model="nb",
+        )
+        summary = test.summary()
+
+        # Compare p-value distribution under null model against uniform distribution.
+        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
+
+        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
+        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
+
+        return True
+
+    def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
+        """
+        Test if de.test_wald_loc() generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
+        p-value distriubution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+        from batchglm.api.models.numpy.glm_nb import Simulator
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(n_groups, size=sim.nobs)
+        })
+
+        test = de.test.two_sample(
+            data=sim.input_data,
+            grouping=random_sample_description["condition"],
+            test="rank"
+        )
+        summary = test.summary()
+
+        # Compare p-value distribution under null model against uniform distribution.
+        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
+
+        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
+        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
+
+        return True
+
+    def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
+        """
+        Test if de.test_wald_loc() generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
+        p-value distriubution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+        from batchglm.api.models.numpy.glm_nb import Simulator
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(n_groups, size=sim.nobs)
+        })
+
+        test = de.test.two_sample(
+            data=sim.input_data,
+            grouping=random_sample_description["condition"],
+            test="t_test"
+        )
+        summary = test.summary()
+
+        # Compare p-value distribution under null model against uniform distribution.
+        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue
+
+        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0)
+        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)
+
+        return True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/diffxpy/unit_test/test_vsrest.py b/diffxpy/unit_test/test_vsrest.py
index e0db0eb..bef144a 100644
--- a/diffxpy/unit_test/test_vsrest.py
+++ b/diffxpy/unit_test/test_vsrest.py
@@ -33,7 +33,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="wald",
             noise_model="nb",
@@ -76,7 +76,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100):
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="lrt",
             noise_model="nb",
@@ -119,7 +119,7 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="rank",
             sample_description=random_sample_description,
@@ -159,7 +159,7 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100,
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="t-test",
             sample_description=random_sample_description,
diff --git a/requirements.txt b/requirements.txt
index 497cd7e..33750e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ numpy>=1.14.0
 scipy
 pandas
 patsy>=0.5.0
-batchglm>=0.7.3
+batchglm>=0.7.4
 statsmodels
 anndata
 seaborn
diff --git a/setup.py b/setup.py
index d837d56..7b98e2a 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
         'scipy>=1.2.1',
         'pandas',
         'patsy>=0.5.0',
-        'batchglm>=0.7.3',
+        'batchglm>=0.7.4',
         'statsmodels',
     ],
     extras_require={