diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index 74c9beb..236a209 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -2,6 +2,7 @@ import numpy.linalg import scipy.sparse import scipy.stats +import sys from typing import Union @@ -263,7 +264,9 @@ def wald_test_chisq( raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries') theta_diff = theta_mle - theta0 - wald_statistic = np.array([ + invertible = np.where(np.linalg.cond(theta_covar, p=None) < 1 / sys.float_info.epsilon)[0] + wald_statistic = np.zeros([theta_covar.shape[0]]) + np.nan + wald_statistic[invertible] = np.array([ np.matmul( np.matmul( theta_diff[:, [i]].T, @@ -271,8 +274,10 @@ def wald_test_chisq( ), theta_diff[:, [i]] ) - for i in range(theta_diff.shape[1]) + for i in invertible ]).flatten() + if invertible.shape[0] < theta_covar.shape[0]: + print("caught %i linalg singular matrix errors" % (theta_covar.shape[0] - invertible.shape[0])) pvals = 1 - scipy.stats.chi2(theta_mle.shape[0]).cdf(wald_statistic) return pvals diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index 582d3b1..c378494 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -1026,7 +1026,7 @@ def two_sample( if noise_model is None: raise ValueError("Please specify noise_model") formula_loc = '~ 1 + grouping' - formula_scale = '~ 1 + grouping' + formula_scale = '~ 1' de_test = wald( data=data, factor_loc_totest="grouping", @@ -1038,8 +1038,6 @@ def two_sample( sample_description=sample_description, noise_model=noise_model, size_factors=size_factors, - init_a="closed_form", - init_b="closed_form", batch_size=batch_size, backend=backend, train_args=train_args, diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py index 295f386..6763a23 100644 --- a/diffxpy/testing/utils.py +++ b/diffxpy/testing/utils.py @@ -271,3 +271,49 @@ def constraint_system_from_star( constraints=constraints, return_type=return_type ) + + +def bin_continuous_covariate( + factor_to_bin: str, + bins: Union[int, list, np.ndarray, Tuple], + data: Union[None, anndata.AnnData] = None, + sample_description: Union[None, pd.DataFrame] = None +): + r""" + Bin a continuous covariate. + + Adds the binned covariate to the table. If data is supplied, the covariate is added in place in data.obs, otherwise + the covariate is added in the sample_description and the new sample_description is returned. + Binning is performed on quantiles of the distribution. + + :param factor_to_bin: Name of columns of factor to bin. + :param bins: Number of bins or iteratable with bin borders. If given as integer, the bins are defined on the + quantiles of the covariate, ie the bottom 20% of observations are in the first bin if bins==5. + :param data: Anndata object that contains sample description table in .obs. + :param sample_description: Sample description table. + :return: Sample description table with binned covariate added if sample_description was supplied, otherwise None is + returned as the new column was added in place. + """ + if data is None and sample_description is not None: + sd = sample_description + elif data is not None and sample_description is None: + sd = data.obs + else: + raise ValueError("supply either data or sample_description") + if isinstance(bins, list) or isinstance(bins, np.ndarray) or isinstance(bins, Tuple): + bins = np.asarray(bins) + else: + bins = np.arange(0, 1, 1 / bins) + + fac_binned = glm.data.bin_continuous_covariate( + sample_description=sd, + factor_to_bin=factor_to_bin, + bins=bins + ) + if data is None and sample_description is not None: + sd[factor_to_bin + "_binned"] = fac_binned + return sample_description + elif data is not None and sample_description is None: + data.obs[factor_to_bin + "_binned"] = fac_binned + else: + assert False diff --git a/diffxpy/unit_test/test_twosample.py b/diffxpy/unit_test/test_twosample.py new file mode 100644 index 0000000..7cf357e --- /dev/null +++ b/diffxpy/unit_test/test_twosample.py @@ -0,0 +1,168 @@ +import logging +import unittest +import numpy as np +import pandas as pd +import scipy.stats as stats + +import diffxpy.api as de + + +class TestTwosample(unittest.TestCase): + + def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2): + """ + Test if de.test_wald_loc() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distriubution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + from batchglm.api.models.numpy.glm_nb import Simulator + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(n_groups, size=sim.nobs) + }) + + test = de.test.two_sample( + data=sim.input_data, + grouping=random_sample_description["condition"].values, + test="wald", + noise_model="nb", + ) + summary = test.summary() + + # Compare p-value distribution under null model against uniform distribution. + pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue + + logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0) + assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) + + return True + + def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2): + """ + Test if de.test_wald_loc() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distriubution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + from batchglm.api.models.numpy.glm_nb import Simulator + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(n_groups, size=sim.nobs) + }) + + test = de.test.two_sample( + data=sim.input_data, + grouping=random_sample_description["condition"], + test="wald", + noise_model="nb", + ) + summary = test.summary() + + # Compare p-value distribution under null model against uniform distribution. + pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue + + logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0) + assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) + + return True + + def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2): + """ + Test if de.test_wald_loc() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distriubution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + from batchglm.api.models.numpy.glm_nb import Simulator + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(n_groups, size=sim.nobs) + }) + + test = de.test.two_sample( + data=sim.input_data, + grouping=random_sample_description["condition"], + test="rank" + ) + summary = test.summary() + + # Compare p-value distribution under null model against uniform distribution. + pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue + + logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0) + assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) + + return True + + def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2): + """ + Test if de.test_wald_loc() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distriubution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + from batchglm.api.models.numpy.glm_nb import Simulator + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(n_groups, size=sim.nobs) + }) + + test = de.test.two_sample( + data=sim.input_data, + grouping=random_sample_description["condition"], + test="t_test" + ) + summary = test.summary() + + # Compare p-value distribution under null model against uniform distribution. + pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue + + logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_wald_loc(): %f' % pval_h0) + assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5) + + return True + + +if __name__ == '__main__': + unittest.main() diff --git a/diffxpy/unit_test/test_vsrest.py b/diffxpy/unit_test/test_vsrest.py index e0db0eb..bef144a 100644 --- a/diffxpy/unit_test/test_vsrest.py +++ b/diffxpy/unit_test/test_vsrest.py @@ -33,7 +33,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n }) test = de.test.versus_rest( - data=sim.x, + data=sim.input_data, grouping="condition", test="wald", noise_model="nb", @@ -76,7 +76,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100): }) test = de.test.versus_rest( - data=sim.x, + data=sim.input_data, grouping="condition", test="lrt", noise_model="nb", @@ -119,7 +119,7 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n }) test = de.test.versus_rest( - data=sim.x, + data=sim.input_data, grouping="condition", test="rank", sample_description=random_sample_description, @@ -159,7 +159,7 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, }) test = de.test.versus_rest( - data=sim.x, + data=sim.input_data, grouping="condition", test="t-test", sample_description=random_sample_description, diff --git a/requirements.txt b/requirements.txt index 497cd7e..33750e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy>=1.14.0 scipy pandas patsy>=0.5.0 -batchglm>=0.7.3 +batchglm>=0.7.4 statsmodels anndata seaborn diff --git a/setup.py b/setup.py index d837d56..7b98e2a 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ 'scipy>=1.2.1', 'pandas', 'patsy>=0.5.0', - 'batchglm>=0.7.3', + 'batchglm>=0.7.4', 'statsmodels', ], extras_require={