In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import scipy.stats
import numpy as np
import scanpy.api as sc

import diffxpy.api as de

# Generate some data:

In [2]:
from batchglm.api.models.glm_nb import Simulator

# Seed the random number generator so that a fresh "Restart & Run All"
# regenerates the same simulated data.
# NOTE(review): assumes the batchglm simulator draws from NumPy's global
# random state -- confirm for the installed batchglm version. Re-run the
# notebook after adding the seed so the stored outputs match.
np.random.seed(0)

# Simulate a data set with 2000 observations and 100 features.
sim = Simulator(num_observations=2000, num_features=100)
# Two conditions, no batches; the resulting sample description is reused
# later as the per-observation annotation.
sim.generate_sample_description(num_batches=0, num_conditions=2)
sim.generate_params()
sim.generate_data()

Create an AnnData object:

In [3]:
# Bundle the simulated matrix (converted to a plain NumPy array) with the
# per-observation sample description in a single AnnData container.
adata = sc.AnnData(
    X=np.asarray(sim.X),
    obs=sim.sample_description,
)

From here on, we can treat the AnnData object as a container for the count matrix, the `sample_description`, and the `gene_names`, so we only need to pass `adata` to the diffxpy functions.

# Create annotated reference sets

In [4]:
# Made-up gene sets used as the enrichment reference. Gene ids are given as
# strings; note that '22' is a member of both sets.
made_up_sets = (
    ("setA", ['2', '5', '22', '23']),
    ("setB", ['22', '15', '16', '44', '55', '98', '99']),
)

# Register every set in one reference-set container.
rs = de.enrich.RefSets()
for set_id, set_genes in made_up_sets:
    rs.add(id=set_id, source="made_up", gene_ids=np.array(set_genes))

# Run differential expression test:

For each gene, the t-test checks whether two groups of samples differ significantly.

Therefore, it has to be provided with a parameter `grouping` which specifies the group membership of each sample.
It can be either the name of a column in `sample_description` or a vector of length `num_observations`.


In [6]:
# Per-library log levels: only errors from tensorflow, informational
# messages from batchglm and diffxpy.
log_levels = {
    "tensorflow": logging.ERROR,
    "batchglm": logging.INFO,
    "diffxpy": logging.INFO,
}
for logger_name, logger_level in log_levels.items():
    logging.getLogger(logger_name).setLevel(logger_level)

# Run the t-test on the AnnData object, grouping the observations by the
# "condition" column.
test = de.test.t_test(data=adata, grouping="condition", dtype="float64")

# Perform enrichment

In [7]:
# Test the reference sets for enrichment, using the t-test result as input.
# NOTE(review): de_threshold presumably is the significance cutoff used to
# select the enquiry genes from the test result -- confirm against the
# diffxpy documentation.
enr = de.enrich.test(
    DETest=test,
    RefSets=rs,
    de_threshold=0.005,
    clean_ref=False,
)

10 overlaps found between refset (10) and provided gene list (100).


In [8]:
# Display the enrichment results as a table with one row per reference set
# (columns: set, pval, qval, intersection, reference, enquiry, background).
enr.summary()

Unnamed: 0,set,pval,qval,intersection,reference,enquiry,background
0,setA,0.424291,0.602433,4,4,81,100
1,setB,0.602433,0.602433,6,7,81,100
