In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import logging

import scipy.stats
import numpy as np
import scanpy.api as sc

  return f(*args, **kwds)


# Generate some data:

In [2]:
from batchglm.api.models.nb_glm import Simulator

sim = Simulator(num_observations=2000, num_features=100)
sim.generate_sample_description(num_batches=0, num_confounders=2)

# Sample the location parameters from a normal distribution with mean `mu` and
# standard deviation `phi`, truncated at zero: the standardized lower bound
# -mu / phi maps to 0 once `mu` is added back to the draw.
mu = 1
phi = 0.1
sim.generate_params(
    # np.inf is used instead of the np.infty alias, which NumPy 2.0 removed.
    rand_fn_loc=lambda size: mu + scipy.stats.truncnorm.rvs(-mu / phi, np.inf, scale=phi, size=size)
)
sim.generate_data()

# count data
X = sim.X
# sample description
sample_description = sim.sample_description

The sample description should be a pandas DataFrame with `num_observations` rows.
Each column should represent a property of the dataset.

The module `batchglm.api.data` contains some helper functions which can be useful to create this sample description:

- `sample_description_from_anndata()`
- `sample_description_from_xarray()`

In [3]:
# Preview the first ten rows of the sample description.
sample_description.head(10)

Unnamed: 0_level_0,condition
observations,Unnamed: 1_level_1
0,0
1,1
2,0
3,1
4,0
5,1
6,0
7,1
8,0
9,1


Create anndata object:

In [4]:
# Bundle the count matrix (as a plain NumPy array) with the per-observation
# sample description in a single AnnData object.
adata = sc.AnnData(
    X=np.asarray(X),
    obs=sample_description,
)

From here on, the AnnData object serves as the container for the count matrix, the sample description, and the gene names, so we only need to pass `adata` to the diffxpy functions.

# Create annotated reference set

In [5]:
import diffxpy.api as de

In [13]:
# Build two made-up reference gene sets; gene identifiers are given as strings.
rs = de.enrich.RefSets()
rs.add(
    id="setA",
    source="made_up",
    gene_ids=np.array(["2", "5", "22", "23"]),
)
rs.add(
    id="setB",
    source="made_up",
    gene_ids=np.array(["22", "15", "16", "44", "55", "98", "99"]),
)

# Run differential expression test:

The t-test checks, for each gene, whether its expression differs significantly between two groups of samples.

Therefore, it has to be provided with a parameter `grouping` which specifies the group membership of each sample.
It can be either the name of a column in `sample_description` or a vector of length `num_observations`.


In [7]:
# Logging verbosity: only errors from TensorFlow, informational messages
# from batchglm and diffxpy.
tensorflow_logger = logging.getLogger("tensorflow")
tensorflow_logger.setLevel(logging.ERROR)
batchglm_logger = logging.getLogger("batchglm")
batchglm_logger.setLevel(logging.INFO)
diffxpy_logger = logging.getLogger("diffxpy")
diffxpy_logger.setLevel(logging.INFO)

# Run the t-test, grouping observations by the "condition" column.
test = de.test.t_test(data=adata, grouping="condition")


# Perform enrichment

In [16]:
# Run the gene set enrichment test on the t-test results against the
# reference sets defined above.
enr = de.enrich.test(
    DETest=test,
    RefSets=rs,
    de_threshold=0.005,
    clean_ref=False,
)

10 overlaps found between refset (10) and provided gene list (100).


In [17]:
# Display the enrichment results as a table with one row per reference set.
enr.summary()

Unnamed: 0,set,pval,qval,intersection,reference,enquiry,background
1,setB,0.309319,0.618637,5,7,55,100
0,setA,0.762971,0.762971,2,4,55,100
