In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
# Print the installed anndata version so readers know which release produced these outputs.
print(ad.__version__)

0.10.9


In [3]:
# Initialise a toy dataset: a sparse (cells x genes) matrix of Poisson counts
# wrapped in an AnnData object.

# Seed NumPy's global RNG so that this cell -- and the later cells that also
# draw from np.random -- produce the same values on every Restart & Run All.
np.random.seed(0)

# Dimensions of the toy dataset: observations (cells) x variables (genes).
n_cells, n_genes = 100, 2000

# Poisson(lambda=1) counts, stored as float32 in Compressed Sparse Row format.
counts = csr_matrix(np.random.poisson(1, size=(n_cells, n_genes)), dtype=np.float32)
adata_raw = ad.AnnData(counts)
adata_raw

AnnData object with n_obs × n_vars = 100 × 2000

In [5]:
# Inspect .X, the data matrix; the repr below shows it is stored as a SciPy CSR sparse matrix.
adata_raw.X

<100x2000 sparse matrix of type '<class 'numpy.float32'>'
	with 126243 stored elements in Compressed Sparse Row format>

In [11]:
# Label every observation (cell) and every variable (gene) with a readable
# string identifier; these labels become the index used for subsetting.
adata_raw.obs_names = ["Cell_%d" % idx for idx in range(adata_raw.n_obs)]
adata_raw.var_names = ["Gene_%d" % idx for idx in range(adata_raw.n_vars)]

# Show the first ten cell labels to confirm the assignment.
print(adata_raw.obs_names[:10])

Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',
       'Cell_7', 'Cell_8', 'Cell_9'],
      dtype='object')


In [12]:
# Subsetting an AnnData object
# The obs_names / var_names assigned above act as index labels, so cells and
# genes can be selected by name: observations first, then variables.
adata_raw[
    ["Cell_1", "Cell_10"],
    ["Gene_5", "Gene_1900"],
]

View of AnnData object with n_obs × n_vars = 2 × 2

In [15]:
# Adding aligned metadata
# Draw one random cell-type label per observation and store it in .obs as a
# categorical column, so it stays aligned with the rows of the data matrix.
ct = np.random.choice(["B", "T", "Monocyte"], size=adata_raw.n_obs)
adata_raw.obs["cell_type"] = pd.Categorical(ct)
adata_raw.obs

Unnamed: 0,cell_type
Cell_0,T
Cell_1,T
Cell_2,Monocyte
Cell_3,B
Cell_4,B
...,...
Cell_95,B
Cell_96,B
Cell_97,T
Cell_98,Monocyte


In [17]:
# Subsetting with metadata
# A boolean mask built from an .obs column selects the matching observations;
# here we keep only the cells annotated as "B".
bdata = adata_raw[adata_raw.obs["cell_type"] == "B"]
bdata

View of AnnData object with n_obs × n_vars = 34 × 2000
    obs: 'cell_type'

In [18]:
## Observation/variable-level matrices
# Multi-dimensional annotations about observations or features are stored in
# .obsm (per observation) and .varm (per variable).
# The one constraint is that the first dimension of each matrix must match
# the length of obs / var respectively.
adata_raw.obsm["X_umap"] = np.random.normal(
    loc=0, scale=1, size=(adata_raw.n_obs, 2)
)
adata_raw.varm["gene_stuff"] = np.random.normal(
    loc=0, scale=1, size=(adata_raw.n_vars, 5)
)
adata_raw.obsm

AxisArrays with keys: X_umap

A few more notes about .obsm/.varm

The “array-like” metadata can originate from a Pandas DataFrame, scipy sparse matrix, or numpy dense array.

When using scanpy, the values (columns) stored in .obsm/.varm are not easily plotted, whereas items from .obs are easily plotted on, e.g., UMAP plots.

In [19]:
# Display the object summary again to see which annotation slots are now populated.
adata_raw

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'
    obsm: 'X_umap'
    varm: 'gene_stuff'

In [20]:
# Unstructured metadata
# .uns is a free-form, dict-like slot, e.g. for general information about
# how the data was analysed.
adata_raw.uns["random"] = [1, 2, 3]
adata_raw.uns

OrderedDict([('random', [1, 2, 3])])

In [22]:
# Layers
# Layers hold alternative versions of the original data (for example a
# normalized copy) alongside .X, each stored under its own key.

# Store a log1p-transformed version of X as a layer.
adata_raw.layers["log_transformed"] = np.log1p(adata_raw.X)
adata_raw

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'
    uns: 'random'
    obsm: 'X_umap'
    varm: 'gene_stuff'
    layers: 'log_transformed'

In [23]:
# Any layer can be exported as a pandas DataFrame;
# to_df labels the rows with obs_names and the columns with var_names.
adata_raw.to_df(layer="log_transformed")

Unnamed: 0,Gene_0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_1990,Gene_1991,Gene_1992,Gene_1993,Gene_1994,Gene_1995,Gene_1996,Gene_1997,Gene_1998,Gene_1999
Cell_0,1.098612,0.000000,0.693147,0.693147,0.693147,0.000000,0.000000,0.693147,1.098612,1.098612,...,0.000000,1.098612,0.693147,1.098612,1.386294,0.000000,0.000000,0.000000,1.386294,1.386294
Cell_1,0.000000,0.693147,1.098612,0.693147,0.693147,0.693147,0.000000,0.693147,1.098612,0.693147,...,0.693147,0.000000,0.693147,0.000000,0.693147,0.000000,1.098612,0.693147,0.693147,1.386294
Cell_2,1.098612,0.693147,0.693147,0.000000,0.000000,0.693147,0.693147,1.098612,1.386294,0.693147,...,0.000000,0.693147,1.386294,0.000000,1.098612,1.098612,1.386294,1.386294,0.693147,0.693147
Cell_3,0.000000,0.693147,0.693147,1.098612,1.098612,0.000000,1.098612,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.000000,0.000000,0.000000,1.386294,0.000000,0.000000,0.693147,0.000000
Cell_4,0.693147,0.693147,0.000000,0.000000,1.098612,1.609438,0.000000,0.693147,0.000000,1.098612,...,0.693147,1.386294,0.693147,0.693147,0.000000,1.098612,0.693147,1.098612,0.693147,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cell_95,0.693147,0.693147,0.693147,1.386294,1.098612,0.693147,1.098612,1.098612,0.693147,0.693147,...,0.693147,0.000000,1.098612,0.000000,0.000000,0.693147,1.386294,0.000000,1.098612,0.000000
Cell_96,0.693147,0.000000,0.000000,0.000000,0.693147,1.098612,0.000000,1.098612,0.693147,0.000000,...,0.000000,0.693147,1.098612,0.693147,0.000000,0.693147,0.693147,0.000000,1.386294,0.000000
Cell_97,0.000000,1.609438,0.000000,0.693147,0.693147,0.000000,1.098612,1.098612,0.693147,0.000000,...,1.609438,0.000000,0.693147,0.693147,0.693147,0.693147,0.000000,1.386294,0.693147,0.000000
Cell_98,1.098612,0.693147,0.693147,0.000000,1.098612,0.000000,1.098612,0.693147,0.000000,0.693147,...,0.693147,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.693147,0.693147,0.000000


AnnData comes with its own persistent HDF5-based file format: h5ad. If string columns with a small number of categories aren’t yet categoricals, AnnData will auto-transform them to categoricals.



AnnData has become the standard for single-cell analysis in Python, and for good reason – it’s straightforward to use and facilitates more reproducible analyses with its key-based storage. It’s even becoming easier to convert to the popular R-based formats for single-cell analysis.

In [24]:
# Write the results to disk as a gzip-compressed .h5ad file.
adata_raw.write(
    "my_results.h5ad",
    compression="gzip",
)