Dev #147 (Merged)

9 changes: 7 additions & 2 deletions diffxpy/stats/stats.py
@@ -2,6 +2,7 @@
 import numpy.linalg
 import scipy.sparse
 import scipy.stats
+import sys
 from typing import Union


@@ -263,16 +264,20 @@ def wald_test_chisq(
         raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries')
 
     theta_diff = theta_mle - theta0
-    wald_statistic = np.array([
+    invertible = np.where(np.linalg.cond(theta_covar, p=None) < 1 / sys.float_info.epsilon)[0]
+    wald_statistic = np.zeros([theta_covar.shape[0]]) + np.nan
+    wald_statistic[invertible] = np.array([
         np.matmul(
             np.matmul(
                 theta_diff[:, [i]].T,
                 numpy.linalg.inv(theta_covar[i, :, :])
             ),
             theta_diff[:, [i]]
         )
-        for i in range(theta_diff.shape[1])
+        for i in invertible
     ]).flatten()
+    if invertible.shape[0] < theta_covar.shape[0]:
+        print("caught %i linalg singular matrix errors" % (theta_covar.shape[0] - invertible.shape[0]))
     pvals = 1 - scipy.stats.chi2(theta_mle.shape[0]).cdf(wald_statistic)
     return pvals
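
For context, a minimal standalone sketch of the invertibility screen added above, assuming theta_covar stacks one square covariance matrix per test along its first axis: np.linalg.cond(x, p=None) returns the 2-norm condition number of each matrix, and matrices whose condition number reaches 1/machine-epsilon are treated as numerically singular, so their Wald statistic stays NaN instead of the inversion raising numpy.linalg.LinAlgError.

import sys
import numpy as np

def invertible_indices(covar_stack):
    # covar_stack: (n_tests, n_params, n_params) stack of covariance matrices.
    # Same criterion as in the diff: keep only matrices whose condition number
    # is below 1 / machine epsilon, i.e. matrices that are safe to invert.
    cond = np.linalg.cond(covar_stack, p=None)
    return np.where(cond < 1 / sys.float_info.epsilon)[0]

# Toy check: the identity matrix is kept, a numerically singular one is dropped.
stack = np.stack([np.eye(2), np.diag([1.0, 1e-17])])
print(invertible_indices(stack))  # -> [0]
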
4 changes: 1 addition & 3 deletions diffxpy/testing/tests.py
@@ -1026,7 +1026,7 @@ def two_sample(
         if noise_model is None:
             raise ValueError("Please specify noise_model")
         formula_loc = '~ 1 + grouping'
-        formula_scale = '~ 1 + grouping'
+        formula_scale = '~ 1'
         de_test = wald(
             data=data,
             factor_loc_totest="grouping",
@@ -1038,8 +1038,6 @@
             sample_description=sample_description,
             noise_model=noise_model,
             size_factors=size_factors,
-            init_a="closed_form",
-            init_b="closed_form",
             batch_size=batch_size,
             backend=backend,
             train_args=train_args,
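
After this change, the Wald branch of two_sample() keeps the group effect in the location model while the scale (dispersion) model is intercept-only, i.e. one dispersion shared across both groups. A rough, hypothetical sketch of an equivalent explicit wald() call: the toy AnnData object and its counts are made up, and only the argument names visible in the diff context above are assumed.

import anndata
import numpy as np
import pandas as pd
import diffxpy.api as de

# Made-up example data: 100 cells x 5 genes with a two-level "grouping" column.
adata = anndata.AnnData(
    X=np.random.negative_binomial(5, 0.5, size=(100, 5)).astype(np.float32),
    obs=pd.DataFrame({"grouping": np.repeat(["a", "b"], 50)})
)

# Mirrors the formulas set by two_sample() after this change:
# grouping enters the location model only; the scale model is intercept-only.
test = de.test.wald(
    data=adata,
    factor_loc_totest="grouping",
    formula_loc="~ 1 + grouping",
    formula_scale="~ 1",
    noise_model="nb",
)
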
46 changes: 46 additions & 0 deletions diffxpy/testing/utils.py
@@ -271,3 +271,49 @@ def constraint_system_from_star(
         constraints=constraints,
         return_type=return_type
     )
+
+
+def bin_continuous_covariate(
+        factor_to_bin: str,
+        bins: Union[int, list, np.ndarray, Tuple],
+        data: Union[None, anndata.AnnData] = None,
+        sample_description: Union[None, pd.DataFrame] = None
+):
+    r"""
+    Bin a continuous covariate.
+
+    Adds the binned covariate to the table. If data is supplied, the covariate is added in place to data.obs;
+    otherwise, the covariate is added to sample_description and the new sample_description is returned.
+    Binning is performed on quantiles of the distribution.
+
+    :param factor_to_bin: Name of the column of the factor to bin.
+    :param bins: Number of bins or iterable with bin borders. If given as an integer, the bins are defined on
+        quantiles of the covariate, i.e. the bottom 20% of observations are in the first bin if bins==5.
+    :param data: AnnData object that contains the sample description table in .obs.
+    :param sample_description: Sample description table.
+    :return: Sample description table with binned covariate added if sample_description was supplied, otherwise
+        None is returned as the new column was added in place.
+    """
+    if data is None and sample_description is not None:
+        sd = sample_description
+    elif data is not None and sample_description is None:
+        sd = data.obs
+    else:
+        raise ValueError("supply either data or sample_description")
+    if isinstance(bins, list) or isinstance(bins, np.ndarray) or isinstance(bins, Tuple):
+        bins = np.asarray(bins)
+    else:
+        bins = np.arange(0, 1, 1 / bins)
+
+    fac_binned = glm.data.bin_continuous_covariate(
+        sample_description=sd,
+        factor_to_bin=factor_to_bin,
+        bins=bins
+    )
+    if data is None and sample_description is not None:
+        sd[factor_to_bin + "_binned"] = fac_binned
+        return sample_description
+    elif data is not None and sample_description is None:
+        data.obs[factor_to_bin + "_binned"] = fac_binned
+    else:
+        assert False
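
A small usage sketch for the new helper; the column name "pseudotime" and the data are made up, and the actual binning is delegated to glm.data.bin_continuous_covariate from batchglm, in line with the batchglm>=0.7.4 bump below.

import numpy as np
import pandas as pd
from diffxpy.testing.utils import bin_continuous_covariate

sample_description = pd.DataFrame({"pseudotime": np.random.uniform(0, 1, size=200)})
# With an integer argument, bins are quantile-based: here 5 bins of ~20% of observations each.
sample_description = bin_continuous_covariate(
    factor_to_bin="pseudotime",
    bins=5,
    sample_description=sample_description,
)
print(sample_description["pseudotime_binned"].value_counts())
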
168 changes: 168 additions & 0 deletions diffxpy/unit_test/test_twosample.py
@@ -0,0 +1,168 @@
import logging
import unittest
import numpy as np
import pandas as pd
import scipy.stats as stats

import diffxpy.api as de


class TestTwosample(unittest.TestCase):

    def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
        """
        Test whether de.test.two_sample() with test="wald" generates a uniform p-value
        distribution if it is given data simulated based on the null model. Returns the
        p-value of the two-sided Kolmogorov-Smirnov test for equality of the observed
        p-value distribution and a uniform distribution.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)
        from batchglm.api.models.numpy.glm_nb import Simulator

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        random_sample_description = pd.DataFrame({
            "condition": np.random.randint(n_groups, size=sim.nobs)
        })

        test = de.test.two_sample(
            data=sim.input_data,
            grouping=random_sample_description["condition"].values,
            test="wald",
            noise_model="nb",
        )
        summary = test.summary()

        # Compare p-value distribution under null model against uniform distribution.
        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue

        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_null_distribution_wald(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

        return True

    def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
        """
        Test whether de.test.two_sample() generates a uniform p-value distribution
        if it is given data simulated based on the null model. Returns the p-value
        of the two-sided Kolmogorov-Smirnov test for equality of the observed
        p-value distribution and a uniform distribution.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)
        from batchglm.api.models.numpy.glm_nb import Simulator

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        random_sample_description = pd.DataFrame({
            "condition": np.random.randint(n_groups, size=sim.nobs)
        })

        test = de.test.two_sample(
            data=sim.input_data,
            grouping=random_sample_description["condition"],
            test="lrt",
            noise_model="nb",
        )
        summary = test.summary()

        # Compare p-value distribution under null model against uniform distribution.
        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue

        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_null_distribution_lrt(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

        return True

    def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
        """
        Test whether de.test.two_sample() with test="rank" generates a uniform p-value
        distribution if it is given data simulated based on the null model. Returns the
        p-value of the two-sided Kolmogorov-Smirnov test for equality of the observed
        p-value distribution and a uniform distribution.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)
        from batchglm.api.models.numpy.glm_nb import Simulator

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        random_sample_description = pd.DataFrame({
            "condition": np.random.randint(n_groups, size=sim.nobs)
        })

        test = de.test.two_sample(
            data=sim.input_data,
            grouping=random_sample_description["condition"],
            test="rank"
        )
        summary = test.summary()

        # Compare p-value distribution under null model against uniform distribution.
        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue

        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_null_distribution_rank(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

        return True

    def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
        """
        Test whether de.test.two_sample() with test="t_test" generates a uniform p-value
        distribution if it is given data simulated based on the null model. Returns the
        p-value of the two-sided Kolmogorov-Smirnov test for equality of the observed
        p-value distribution and a uniform distribution.

        :param n_cells: Number of cells to simulate (number of observations per test).
        :param n_genes: Number of genes to simulate (number of tests).
        """
        logging.getLogger("tensorflow").setLevel(logging.ERROR)
        logging.getLogger("batchglm").setLevel(logging.WARNING)
        logging.getLogger("diffxpy").setLevel(logging.WARNING)
        from batchglm.api.models.numpy.glm_nb import Simulator

        sim = Simulator(num_observations=n_cells, num_features=n_genes)
        sim.generate_sample_description(num_batches=0, num_conditions=0)
        sim.generate()

        random_sample_description = pd.DataFrame({
            "condition": np.random.randint(n_groups, size=sim.nobs)
        })

        test = de.test.two_sample(
            data=sim.input_data,
            grouping=random_sample_description["condition"],
            test="t_test"
        )
        summary = test.summary()

        # Compare p-value distribution under null model against uniform distribution.
        pval_h0 = stats.kstest(test.pval.flatten(), 'uniform').pvalue

        logging.getLogger("diffxpy").info('KS-test pvalue for null model match of test_null_distribution_ttest(): %f' % pval_h0)
        assert pval_h0 > 0.05, "KS-Test failed: pval_h0=%f is <= 0.05!" % np.round(pval_h0, 5)

        return True


if __name__ == '__main__':
    unittest.main()
8 changes: 4 additions & 4 deletions diffxpy/unit_test/test_vsrest.py
@@ -33,7 +33,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="wald",
             noise_model="nb",
@@ -76,7 +76,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100):
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="lrt",
             noise_model="nb",
@@ -119,7 +119,7 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="rank",
             sample_description=random_sample_description,
@@ -159,7 +159,7 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n
         })
 
         test = de.test.versus_rest(
-            data=sim.x,
+            data=sim.input_data,
             grouping="condition",
             test="t-test",
             sample_description=random_sample_description,
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@ numpy>=1.14.0
 scipy
 pandas
 patsy>=0.5.0
-batchglm>=0.7.3
+batchglm>=0.7.4
 statsmodels
 anndata
 seaborn
2 changes: 1 addition & 1 deletion setup.py
@@ -21,7 +21,7 @@
         'scipy>=1.2.1',
         'pandas',
         'patsy>=0.5.0',
-        'batchglm>=0.7.3',
+        'batchglm>=0.7.4',
         'statsmodels',
     ],
     extras_require={