"""Main class and helper functions.
from enum import Enum
from collections import OrderedDict
from collections.abc import MutableMapping
from functools import reduce
from pathlib import Path
from typing import Any, Union, Optional
from typing import Iterable, Sized, Sequence, Mapping
from typing import Tuple, List
from copy import deepcopy
import warnings
import numpy as np
from numpy import ma
import pandas as pd
from pandas.core.index import RangeIndex
from pandas.api.types import is_string_dtype, is_categorical
from scipy import sparse
from scipy.sparse import issparse
from natsort import natsorted
# try importing zarr
try:
    from zarr.core import Array as ZarrArray
except ImportError:
    class ZarrArray:
        @staticmethod
        def __repr__():
            return 'mock zarr.core.Array'
# try importing zappy
try:
    from zappy.base import ZappyArray
except ImportError:
    class ZappyArray:
        @staticmethod
        def __repr__():
            return 'mock zappy.base.ZappyArray'
from .alignedmapping import (
    AxisArraysBase, AxisArrays,
    # PairwiseArraysBase, PairwiseArrays,
    LayersBase, Layers,
)
from .. import h5py
from .views import ArrayView, SparseCSRView, SparseCSCView, DictView, DataFrameView
from .. import utils
from ..utils import Index1D, Index, get_n_items_idx, convert_to_dict, unpack_index
from ..logging import anndata_logger as logger
from ..compat import PathLike, warn_flatten
class StorageType(Enum):
    Array = np.ndarray
    Masked = ma.MaskedArray
    Sparse = sparse.spmatrix
    ZarrArry = ZarrArray
    ZappyArry = ZappyArray

    @classmethod
    def classes(cls):
        return tuple(c.value for c in cls.__members__.values())
# for backwards compat
def _find_corresponding_multicol_key(key, keys_multicol):
"""Find the corresponding multicolumn key."""
for mk in keys_multicol:
if key.startswith(mk) and 'of' in key:
return mk
return None
# for backwards compat
def _gen_keys_from_multicol_key(key_multicol, n_keys):
"""Generates single-column keys from multicolumn key."""
keys = [('{}{:03}of{:03}')
.format(key_multicol, i+1, n_keys) for i in range(n_keys)]
return keys
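# A minimal sketch of how the two backwards-compat helpers above interact
# (illustrative only; 'X_pca' is a hypothetical multicolumn key):
#
#     keys = _gen_keys_from_multicol_key('X_pca', 3)
#     # -> ['X_pca001of003', 'X_pca002of003', 'X_pca003of003']
#     _find_corresponding_multicol_key(keys[0], ['X_pca'])
#     # -> 'X_pca'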
def df_to_records_fixed_width(df, var_len_str=True):
    uns = {}  # unstructured dictionary for storing categories
    names = ['index']
    if is_string_dtype(df.index):
        if var_len_str:
            index = df.index.values.astype(h5py.special_dtype(vlen=str))
        else:
            max_len_index = 0 if 0 in df.shape else df.index.map(len).max()
            index = df.index.values.astype('S{}'.format(max_len_index))
    else:
        index = df.index.values
    arrays = [index]
    for k in df.columns:
        names.append(k)
        if is_string_dtype(df[k]) and not is_categorical(df[k]):
            if var_len_str:
                arrays.append(df[k].values.astype(h5py.special_dtype(vlen=str)))
            else:
                lengths = df[k].map(len)
                if is_categorical(lengths): lengths = lengths.cat.categories
                arrays.append(df[k].values.astype('S{}'.format(lengths.max())))
        elif is_categorical(df[k]):
            uns[k + '_categories'] = df[k].cat.categories
            arrays.append(df[k].cat.codes)
        else:
            arrays.append(df[k].values)
    formats = [v.dtype for v in arrays]
    return np.rec.fromarrays(
        arrays,
        dtype={'names': names, 'formats': formats}), uns
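# Usage sketch for df_to_records_fixed_width (hypothetical data, illustrative only):
#
#     df = pd.DataFrame(
#         {'group': pd.Categorical(['a', 'b', 'a'])},
#         index=['cell1', 'cell2', 'cell3'])
#     rec, uns = df_to_records_fixed_width(df)
#     # rec is a numpy record array with fields ('index', 'group'), where
#     # 'group' holds the integer category codes;
#     # uns == {'group_categories': <categories 'a', 'b'>}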
def _check_2d_shape(X):
    """Check shape of array or sparse matrix.

    Assure that X is always 2D: Unlike numpy we always deal with 2D arrays.
    """
    if X.dtype.names is None and len(X.shape) != 2:
        raise ValueError('X needs to be 2-dimensional, not '
                         '{}-dimensional.'.format(len(X.shape)))
def _normalize_index(index, names):
    if not isinstance(names, RangeIndex):
        assert names.dtype != float and names.dtype != int, \
            'Don’t call _normalize_index with non-categorical/string names'

    # the following is insanely slow for sequences, we replaced it using pandas below
    def name_idx(i):
        if isinstance(i, str):
            # `where` returns a 1-tuple (1D array) of found indices
            i_found = np.where(names == i)[0]
            if len(i_found) == 0:  # returns array of length 0 if nothing is found
                raise IndexError(
                    'Key "{}" is not valid observation/variable name/index.'
                    .format(i))
            i = i_found[0]
        return i

    if isinstance(index, slice):
        start = name_idx(index.start)
        stop = name_idx(index.stop)
        # string slices can only be inclusive, so +1 in that case
        if isinstance(index.stop, str):
            stop = None if stop is None else stop + 1
        step = index.step
        return slice(start, stop, step)
    elif isinstance(index, (np.integer, int, str)):
        return name_idx(index)
    elif isinstance(index, (Sequence, np.ndarray, pd.Index)):
        # here, we replaced the implementation based on name_idx with this
        # incredibly faster one
        positions = pd.Series(index=names, data=range(len(names)))
        positions = positions[index]
        if positions.isnull().values.any():
            not_found = positions.index[positions.isnull().values]
            raise KeyError(
                'Values {}, from {}, are not valid obs/ var names or indices.'
                .format(list(not_found), list(positions.index)))
        return positions.values
    else:
        raise IndexError('Unknown index {!r} of type {}'
                         .format(index, type(index)))
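# Usage sketch for _normalize_index (hypothetical names, illustrative only):
#
#     names = pd.Index(['cell1', 'cell2', 'cell3'])
#     _normalize_index('cell2', names)                  # -> 1
#     _normalize_index(slice('cell1', 'cell2'), names)  # -> slice(0, 2, None)
#     _normalize_index(['cell1', 'cell3'], names)       # -> array([0, 2])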
def _gen_dataframe(anno, length, index_names):
    if isinstance(anno, pd.DataFrame):
        anno = anno.copy()
        if not is_string_dtype(anno.index):
            logger.warning('Transforming to str index.')
            anno.index = anno.index.astype(str)
        return anno
    if anno is None or len(anno) == 0:
        _anno = pd.DataFrame(index=RangeIndex(0, length, name=None).astype(str))
    else:
        for index_name in index_names:
            if index_name in anno:
                _anno = pd.DataFrame(
                    anno, index=anno[index_name],
                    columns=[k for k in anno.keys() if k != index_name])
                break
        else:
            _anno = pd.DataFrame(
                anno, index=RangeIndex(0, length, name=None).astype(str))
    return _anno
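# Usage sketch for _gen_dataframe (illustrative only):
#
#     _gen_dataframe(None, 3, ['obs_names'])
#     # -> empty DataFrame with string index '0', '1', '2'
#     _gen_dataframe({'obs_names': ['a', 'b'], 'x': [1, 2]}, 2, ['obs_names'])
#     # -> DataFrame indexed by 'a', 'b' with the single column 'x'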
class AnnDataFileManager:
    """Backing file manager for AnnData."""

    def __init__(
        self,
        adata: 'AnnData',
        filename: Optional[PathLike] = None,
        filemode: Optional[str] = None,
    ):
        self._adata = adata
        self.filename = filename
        self._filemode = filemode
        self._file = None
        if filename:
            self.open()

    def __repr__(self) -> str:
        if self.filename is None:
            return 'Backing file manager: no file is set.'
        else:
            return 'Backing file manager of file {}.'.format(self.filename)

    def __getitem__(self, key: str) -> Union[h5py.Group, h5py.Dataset, h5py.SparseDataset]:
        return self._file[key]

    def __setitem__(self, key: str, value: Union[h5py.Group, h5py.Dataset, h5py.SparseDataset]):
        self._file[key] = value

    def __delitem__(self, key: str):
        del self._file[key]

    @property
    def filename(self) -> Path:
        return self._filename

    @filename.setter
    def filename(self, filename: Optional[PathLike]):
        self._filename = None if filename is None else Path(filename)

    def open(
        self,
        filename: Optional[PathLike] = None,
        filemode: Optional[str] = None,
    ):
        if filename is not None:
            self.filename = filename
        if filemode is not None:
            self._filemode = filemode
        if self.filename is None:
            raise ValueError(
                'Cannot open backing file if backing not initialized.')
        self._file = h5py.File(self.filename, self._filemode, force_dense=True)

    def close(self):
        """Close the backing file, remember filename, do *not* change to memory mode."""
        if self._file is not None:
            self._file.close()

    def _to_memory_mode(self):
        """Close the backing file, forget filename, *do* change to memory mode."""
        self._adata.__X = self._adata.X[()]
        self._file.close()
        self._file = None
        self._filename = None

    @property
    def isopen(self) -> bool:
        """State of backing file."""
        if self._file is None:
            return False
        # try accessing the id attribute to see if the file is open
        return bool(self._file.id)
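# Usage sketch for AnnDataFileManager (assuming an existing .h5ad file;
# illustrative only, not part of the library API surface):
#
#     adata = AnnData(filename='data.h5ad', filemode='r+')
#     adata.file.isopen      # -> True
#     adata.file['X']        # h5py dataset backing .X
#     adata.file.close()     # keeps the filename, stays in backed mode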
class Raw:
    def __init__(
        self,
        adata: Optional['AnnData'] = None,
        X: Union[np.ndarray, sparse.spmatrix, None] = None,
        var: Optional[AxisArraysBase] = None,
        varm: Optional[AxisArraysBase] = None,
    ):
        self._adata = adata
        self._n_obs = adata.n_obs
        if X is not None:
            self._X = X
            self._var = var
            self._varm = varm
        else:
            self._X = None if adata.isbacked else adata.X.copy()
            self._var = adata.var.copy()
            self._varm = adata.varm.copy()
    @property
    def X(self):
        if self._adata.isbacked:
            if not self._adata.file.isopen:
                self._adata.file.open()
            X = self._adata.file['raw.X']
            if self._adata.isview:
                return X[self._adata._oidx, self._adata._vidx]
            else:
                return X
        if self.n_obs == 1 and self.n_vars == 1:
            return self._X[0, 0]
        elif self.n_obs == 1 or self.n_vars == 1:
            X = self._X
            if issparse(self._X):
                X = self._X.toarray()
            return X.flatten()
        else:
            return self._X
    @property
    def shape(self):
        return self.X.shape

    @property
    def var(self):
        return self._var

    @property
    def n_vars(self):
        return self._var.shape[0]

    @property
    def n_obs(self):
        return self._n_obs

    @property
    def varm(self):
        return self._varm

    @property
    def var_names(self):
        return self.var.index

    @property
    def obs_names(self):
        return self._adata.obs_names
    def __getitem__(self, index):
        oidx, vidx = self._normalize_indices(index)
        if self._adata is None or not self._adata.isbacked:
            X = self._X[oidx, vidx]
        else:
            X = self._adata.file['raw.X'][oidx, vidx]
        if isinstance(vidx, (int, np.integer)):
            # To preserve two dimensional shape
            vidx = slice(vidx, vidx + 1, 1)
        var = self._var.iloc[vidx]
        new = Raw(self._adata, X=X, var=var)
        if self._varm is not None:
            new._varm = self._varm._view(self, vidx)
        return new
def copy(self):
return Raw(self._adata, X=self._X.copy(), var=self._var.copy(),
varm=None if self._varm is None else self._varm.copy())
def _normalize_indices(self, packed_index):
# deal with slicing with pd.Series
if isinstance(packed_index, pd.Series):
packed_index = packed_index.values
if isinstance(packed_index, tuple):
if len(packed_index) != 2:
raise IndexDimError(len(packed_index))
if isinstance(packed_index[1], pd.Series):
packed_index = packed_index[0], packed_index[1].values
if isinstance(packed_index[0], pd.Series):
packed_index = packed_index[0].values, packed_index[1]
obs, var = unpack_index(packed_index)
obs = _normalize_index(obs, self._adata.obs_names)
var = _normalize_index(var, self.var_names)
return obs, var
    def var_vector(self, k: str) -> np.ndarray:
        """Convenience function for returning a 1 dimensional ndarray of values
        from `.X` or `.var`.

        Made for convenience, not performance. Intentionally permissive about
        arguments, for easy iterative use.

        Params
        ------
        k
            Key to use. Should be in `.obs_names` or `.var.columns`.

        Returns
        -------
        A one dimensional nd array, with values for each var in the same order
        as `.var_names`.
        """
        if k in self.var:
            return self.var[k].values
        idx = self._normalize_indices((k, slice(None)))
        a = self.X[idx]
        if issparse(a):
            a = a.toarray()
        return np.ravel(a)
    def obs_vector(self, k: str) -> np.ndarray:
        """Convenience function for returning a 1 dimensional ndarray of values
        from `.X`.

        Made for convenience, not performance. Intentionally permissive about
        arguments, for easy iterative use.

        Params
        ------
        k
            Key to use. Should be in `.var_names` or `.obs.columns`. If `use_raw`,
            value should be in `.raw.var_names` instead of `.var_names`.

        Returns
        -------
        A one dimensional nd array, with values for each obs in the same order
        as `.obs_names`.
        """
        idx = self._normalize_indices((slice(None), k))
        a = self.X[idx]
        if issparse(a):
            a = a.toarray()
        return np.ravel(a)
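# Usage sketch for Raw (illustrative only, 'some_gene' is a hypothetical name):
# `.raw` keeps a frozen copy of `.X` and `.var`, so variables filtered out of
# the main object stay retrievable:
#
#     adata.raw = adata                        # freeze current X/var
#     adata = adata[:, adata.var_names[:10]]   # filter variables
#     adata.raw[:, 'some_gene'].X              # still available from raw
#     adata.raw.var_vector('some_gene')        # 1D values across all obs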
INDEX_DIM_ERROR_MSG = 'You tried to slice an AnnData(View) object with an ' \
    '{}-dimensional index, but only 2 dimensions exist in such an object.'
INDEX_DIM_ERROR_MSG_1D = '\nIf you tried to slice cells using adata[cells, ], ' \
'be aware that Python (unlike R) uses adata[cells, :] as slicing syntax.'
class IndexDimError(IndexError):
    def __init__(self, n_dims):
        msg = INDEX_DIM_ERROR_MSG.format(n_dims)
        if n_dims == 1:
            msg += INDEX_DIM_ERROR_MSG_1D
        super().__init__(msg)
class AnnData(metaclass=utils.DeprecationMixinMeta):
"""An annotated data matrix.
:class:`~anndata.AnnData` stores a data matrix :attr:`X` together with annotations
of observations :attr:`obs`, variables :attr:`var` and unstructured annotations :attr:`uns`.
.. figure::
:width: 350px
An :class:`~anndata.AnnData` object ``adata`` can be sliced like a pandas
dataframe, for instance, ``adata_subset = adata[:, list_of_variable_names]``.
:class:`~anndata.AnnData`'s basic structure is similar to R's ExpressionSet
[Huber15]_. If setting an ``.h5ad``-formatted HDF5 backing file ``.filename``,
data remains on the disk but is automatically loaded into memory if needed.
See this `blog post`_ for more details.
.. _blog post:
    Parameters
    ----------
    X
        A #observations × #variables data matrix. A view of the data is used if the
        data type matches, otherwise, a copy is made.
    obs
        Key-indexed one-dimensional observations annotation of length #observations.
    var
        Key-indexed one-dimensional variables annotation of length #variables.
    uns
        Key-indexed unstructured annotation.
    obsm
        Key-indexed multi-dimensional observations annotation of length #observations.
        If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype.
    varm
        Key-indexed multi-dimensional variables annotation of length #variables.
        If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype.
    dtype
        Data type used for storage.
    shape
        Shape tuple (#observations, #variables). Can only be provided if ``X`` is ``None``.
    filename
        Name of backing file. See :class:`anndata.h5py.File`.
    filemode
        Open mode of backing file. See :class:`anndata.h5py.File`.
    layers
        Dictionary with keys as layers' names and values as matrices of the same dimensions as X.

    Notes
    -----
    Multi-dimensional annotations are stored in :attr:`obsm` and :attr:`varm`.
Indexing into an AnnData object with a numeric is supposed to be positional,
like pandas’ :attr:`~pandas.DataFrame.iloc` accessor, while indexing with a string/categorical is
supposed to behave like :attr:`~pandas.DataFrame.loc`.
If the unstructured annotations :attr:`uns` contain a sparse matrix of shape
:attr:`n_obs` × :attr:`n_obs`, these are sliced when calling ``[]``.
A data matrix is flattened if either :attr:`n_obs` or :attr:`n_vars` is 1, so that
numpy's slicing behavior is reproduced::
adata = AnnData(np.ones((2, 2)))
adata[:, 0].X == adata.X[:, 0]
:class:`~anndata.AnnData` stores observations (samples) of variables
(features) in the rows of a matrix. This is the convention of the modern
classics of statistics [Hastie09]_ and machine learning [Murphy12]_, the
convention of dataframes both in R and Python and the established statistics
and machine learning packages in Python (statsmodels_, scikit-learn_).
.. _statsmodels:
.. _scikit-learn:
_BACKED_ATTRS = ['X', 'raw.X']
    # backwards compat
    _H5_ALIASES = {
        'X': {'X', '_X', 'data', '_data'},
        'obs': {'obs', '_obs', 'smp', '_smp'},
        'var': {'var', '_var'},
        'uns': {'uns'},
        'obsm': {'obsm', '_obsm', 'smpm', '_smpm'},
        'varm': {'varm', '_varm'},
        'layers': {'layers', '_layers'},
    }

    _H5_ALIASES_NAMES = {
        'obs': {'obs_names', 'smp_names', 'row_names', 'index'},
        'var': {'var_names', 'col_names', 'index'},
    }
    def __init__(
        self,
X: Optional[Union[np.ndarray, sparse.spmatrix, pd.DataFrame]] = None,
obs: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None,
var: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None,
uns: Optional[Mapping[str, Any]] = None,
obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None,
varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None,
layers: Optional[Mapping[str, Union[np.ndarray, sparse.spmatrix]]] = None,
raw: Optional[Raw] = None,
dtype: Union[np.dtype, str] = 'float32',
shape: Optional[Tuple[int, int]] = None,
filename: Optional[PathLike] = None,
filemode: Optional[str] = None,
asview: bool = False,
# obsp: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None,
# varp: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None,
oidx: Index1D = None,
        vidx: Index1D = None,
    ):
if asview:
if not isinstance(X, AnnData):
raise ValueError('`X` has to be an AnnData object.')
            self._init_as_view(X, oidx, vidx)
        else:
            self._init_as_actual(
                X=X, obs=obs, var=var, uns=uns,
                obsm=obsm, varm=varm, raw=raw, layers=layers,
                # obsp=obsp, varp=varp,
                dtype=dtype, shape=shape,
                filename=filename, filemode=filemode)
def _init_as_view(self, adata_ref: 'AnnData', oidx: Index, vidx: Index):
if adata_ref.isbacked and adata_ref.isview:
raise ValueError(
'Currently, you cannot index repeatedly into a backed AnnData, '
'that is, you cannot make a view of a view.')
self._isview = True
self._adata_ref = adata_ref
self._oidx = oidx
self._vidx = vidx
# the file is the same as of the reference object
self.file = adata_ref.file
# views on attributes of adata_ref
oidx_normalized, vidx_normalized = oidx, vidx
if isinstance(oidx, (int, np.integer)): oidx_normalized = slice(oidx, oidx+1, 1)
if isinstance(vidx, (int, np.integer)): vidx_normalized = slice(vidx, vidx+1, 1)
obs_sub = adata_ref.obs.iloc[oidx_normalized]
var_sub = adata_ref.var.iloc[vidx_normalized]
self._obsm = adata_ref.obsm._view(self, oidx_normalized)
self._varm = adata_ref.varm._view(self, vidx_normalized)
# TODO: Should this be normalized
self._layers = adata_ref.layers._view(self, (oidx, vidx))
# self._obsp = adata_ref.obsp._view(self, oidx_normalized)
# self._varp = adata_ref.varp._view(self, vidx_normalized)
# hackish solution here, no copy should be necessary
uns_new = deepcopy(self._adata_ref._uns)
# need to do the slicing before setting the updated self._n_obs, self._n_vars
self._n_obs = self._adata_ref.n_obs # use the original n_obs here
self._slice_uns_sparse_matrices_inplace(uns_new, self._oidx)
# fix _n_obs, _n_vars
if isinstance(oidx, slice):
self._n_obs = get_n_items_idx(obs_sub.index, adata_ref.n_obs)
elif isinstance(oidx, (int, np.integer)):
self._n_obs = 1
        elif isinstance(oidx, Sized):
            self._n_obs = get_n_items_idx(oidx, adata_ref.n_obs)
        else:
            raise KeyError('Unknown Index type')
if isinstance(vidx, slice):
self._n_vars = get_n_items_idx(var_sub.index, adata_ref.n_vars)
elif isinstance(vidx, (int, np.integer)):
self._n_vars = 1
        elif isinstance(vidx, Sized):
            self._n_vars = get_n_items_idx(vidx, adata_ref.n_vars)
        else:
            raise KeyError('Unknown Index type')
# fix categories
self._remove_unused_categories(adata_ref.obs, obs_sub, uns_new)
self._remove_unused_categories(adata_ref.var, var_sub, uns_new)
# set attributes
self._obs = DataFrameView(obs_sub, view_args=(self, 'obs'))
self._var = DataFrameView(var_sub, view_args=(self, 'var'))
self._uns = DictView(uns_new, view_args=(self, 'uns'))
# set data
        if self.isbacked:
            self._X = None
        else:
            self._init_X_as_view()
# set raw, easy, as it's immutable anyways...
if adata_ref._raw is not None:
# slicing along variables axis is ignored
            self._raw = adata_ref.raw[oidx]
        else:
            self._raw = None
    def _init_X_as_view(self):
        if self._adata_ref.X is None:
            self._X = None
            return
        X = self._adata_ref._X[self._oidx, self._vidx]
        if isinstance(X, sparse.csr_matrix):
            self._X = SparseCSRView(X, view_args=(self, 'X'))
        elif isinstance(X, sparse.csc_matrix):
            self._X = SparseCSCView(X, view_args=(self, 'X'))
        elif issparse(X):
            raise ValueError('View on non-csr/csc sparse matrices not implemented.')
        elif isinstance(X, ZappyArray):  # ZappyArray acts as a view itself
            self._X = X
        else:
            shape = (
                get_n_items_idx(self._oidx, self._adata_ref.n_obs),
                get_n_items_idx(self._vidx, self._adata_ref.n_vars),
            )
            if np.isscalar(X):
                X = X.view()
            self._X = ArrayView(X.reshape(shape), view_args=(self, 'X'))
def _init_as_actual(
self, X=None, obs=None, var=None, uns=None,
obsm=None, varm=None,
# varp=None, obsp=None,
raw=None, layers=None,
dtype='float32', shape=None,
filename=None, filemode=None):
        from ..readwrite.read import _read_args_from_h5ad
# view attributes
self._isview = False
self._adata_ref = None
self._oidx = None
self._vidx = None
# ----------------------------------------------------------------------
# various ways of initializing the data
# ----------------------------------------------------------------------
# init from file
if filename is not None:
if any((X, obs, var, uns, obsm, varm)):
raise ValueError(
'If initializing from `filename`, '
'no further arguments may be passed.')
self.file = AnnDataFileManager(self, filename, filemode)
X, obs, var, uns, obsm, varm, layers, raw = _read_args_from_h5ad(self, mode=filemode)
            if X is not None:
                # this is not a function that a user would use, hence it's fine to set the dtype
                dtype = X.dtype.name
        else:
            self.file = AnnDataFileManager(self, None)
# init from AnnData
if isinstance(X, AnnData):
if any((obs, var, uns, obsm, varm)):
raise ValueError(
                    'If `X` is an AnnData no further arguments must be provided.')
X, obs, var, uns, obsm, varm, layers, raw = X._X, X.obs, X.var, X.uns, X.obsm, X.varm, X.layers, X.raw
# init from DataFrame
elif isinstance(X, pd.DataFrame):
if obs is None:
obs = pd.DataFrame(index=X.index)
if not X.index.equals(obs.index):
raise ValueError(
'Index of obs must match index of X.')
if var is None:
var = pd.DataFrame(index=X.columns)
if not X.columns.equals(var.index):
raise ValueError(
'Index of var must match columns of X.')
X = X.values
# ----------------------------------------------------------------------
# actually process the data
# ----------------------------------------------------------------------
# check data type of X
        if X is not None:
            for s_type in StorageType:
                if isinstance(X, s_type.value):
                    break
            else:
                class_names = ', '.join(c.__name__ for c in StorageType.classes())
                raise ValueError('`X` needs to be of one of {}, not {}.'
                                 .format(class_names, type(X)))
            if shape is not None:
                raise ValueError('`shape` needs to be `None` if `X` is not `None`.')
# if type doesn't match, a copy is made, otherwise, use a view
if issparse(X) or isinstance(X, ma.MaskedArray):
# TODO: maybe use view on data attribute of sparse matrix
# as in readwrite.read_10x_h5
if X.dtype != np.dtype(dtype): X = X.astype(dtype)
elif isinstance(X, ZarrArray):
X = X.astype(dtype)
else: # is np.ndarray
X = X.astype(dtype, copy=False)
            # data matrix and shape
            self._X = X
            self._n_obs, self._n_vars = self._X.shape
        else:
            self._X = None
            self._n_obs = len([] if obs is None else obs)
            self._n_vars = len([] if var is None else var)
# check consistency with shape
        if shape is not None:
            if self._n_obs == 0:
                self._n_obs = shape[0]
            else:
                if self._n_obs != shape[0]:
                    raise ValueError('`shape` is inconsistent with `obs`')
            if self._n_vars == 0:
                self._n_vars = shape[1]
            else:
                if self._n_vars != shape[1]:
                    raise ValueError('`shape` is inconsistent with `var`')
# annotations
self._obs = _gen_dataframe(obs, self._n_obs,
['obs_names', 'row_names', 'smp_names'])
self._var = _gen_dataframe(var, self._n_vars, ['var_names', 'col_names'])
# unstructured annotations
self._uns = uns or OrderedDict()
# TODO: Think about consequences of making obsm a group in hdf
self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm))
self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm))
# self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp))
# self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp))
# raw
        if raw is None:
            self._raw = None
        elif isinstance(raw, Raw):
            self._raw = raw
        else:
            # is dictionary from reading the file, nothing that is meant for a user
            shape = self.file['raw.X'].shape if self.isbacked else raw['X'].shape
            self._raw = Raw(
                self, X=raw['X'],
                var=_gen_dataframe(raw['var'], shape[1], ['var_names', 'col_names']),
                varm=raw['varm'] if 'varm' in raw else None)

        # clean up old formats
        self._clean_up_old_format(uns)
# layers
self._layers = Layers(self, layers)
def __sizeof__(self) -> int:
size = 0
for attr in ['_X', '_obs', '_var', '_uns', '_obsm', '_varm']:
s = getattr(self, attr).__sizeof__()
size += s
return size
def _gen_repr(self, n_obs, n_vars) -> str:
        if self.isbacked:
            backed_at = 'backed at \'{}\''.format(self.filename)
        else:
            backed_at = ''
descr = (
'AnnData object with n_obs × n_vars = {} × {} {}'
.format(n_obs, n_vars, backed_at))
for attr in ['obs', 'var', 'uns', 'obsm', 'varm', 'layers']: # 'obsp', 'varp',
keys = getattr(self, attr).keys()
if len(keys) > 0:
descr += '\n {}: {}'.format(attr, str(list(keys))[1:-1])
return descr
def __repr__(self) -> str:
if self.isview:
return 'View of ' + self._gen_repr(self.n_obs, self.n_vars)
return self._gen_repr(self.n_obs, self.n_vars)
    @property
    def shape(self) -> Tuple[int, int]:
        r"""Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`)."""
        return self.n_obs, self.n_vars

    @property
    def X(self) -> Optional[Union[np.ndarray, sparse.spmatrix, ArrayView]]:
        """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`."""
        if self.isbacked:
            if not self.file.isopen:
                self.file.open()
            X = self.file['X']
            if self.isview:
                X = X[self._oidx, self._vidx]
            return X
        if self.n_obs == 1 and self.n_vars == 1:
            return self._X[0, 0]
        elif self.n_obs == 1 or self.n_vars == 1:
            X = self._X
            if issparse(self._X):
                X = self._X.toarray()
            return X.flatten()
        else:
            return self._X
    @X.setter
    def X(self, value: Optional[Union[np.ndarray, sparse.spmatrix]]):
        if value is None:
            if self.isview:
                raise ValueError('Copy the view before setting the data matrix to `None`.')
            if self.isbacked:
                raise ValueError('Not implemented.')
            self._X = None
            return
        var_get = self.n_vars == 1 and self.n_obs == len(value)
        obs_get = self.n_obs == 1 and self.n_vars == len(value)
        if var_get or obs_get or self.shape == value.shape:
            if self.isbacked:
                if self.isview:
                    self.file['X'][self._oidx, self._vidx] = value
                else:
                    self._set_backed('X', value)
            elif self.isview:
                # exit the view if we go from sparse to dense
                if (
                    issparse(value) and not issparse(self._adata_ref._X)
                    or not issparse(value) and issparse(self._adata_ref._X)
                ):
                    self._init_as_actual(self.copy())
                    self._X = value
                else:
                    self._adata_ref._X[self._oidx, self._vidx] = value
                    self._init_X_as_view()
            else:
                self._X = value
        else:
            raise ValueError('Data matrix has wrong shape {}, need to be {}'
                             .format(value.shape, self.shape))
    @property
    def layers(self) -> LayersBase:
        """Dictionary-like object with values of the same dimensions as :attr:`X`.

        Layers in AnnData have API similar to loompy :ref:`loomlayers`.

        Return the layer named ``"unspliced"``::

            adata.layers["unspliced"]

        Create or replace the ``"spliced"`` layer::

            adata.layers["spliced"] = ...

        Assign the 10th column of layer ``"spliced"`` to the variable a::

            a = adata.layers["spliced"][:, 10]

        Delete the ``"spliced"`` layer::

            del adata.layers["spliced"]

        Return layers’ names::

            adata.layers.keys()

        .. warning::

            If AnnData is a view, setting subsets of layers modifies the original data.
        """
        return self._layers
    @property
    def raw(self) -> Raw:
        """Store raw version of :attr:`X` and :attr:`var` as ``.raw.X`` and ``.raw.var``.

        The :attr:`raw` attribute is initialized with the current content of an object by setting::

            adata.raw = adata

        Its content can be deleted by setting it back to ``None``::

            adata.raw = None

        Upon slicing an AnnData object along the observations (row) axis,
        :attr:`raw` is also sliced. Slicing an AnnData object along the variables
        (columns) axis leaves :attr:`raw` unaffected. Note that you can call::

            adata.raw[:, 'orig_variable_name'].X

        to retrieve the data associated with a variable that might have been
        filtered out or "compressed away" in :attr:`X`.
        """
        return self._raw
    @raw.setter
    def raw(self, value: Optional['AnnData']):
        if not (isinstance(value, AnnData) or value is None):
            raise ValueError(
                'Can only init raw attribute with an AnnData object or `None`.')
        if value is None:
            self._raw = None
        else:
            if self.isview:
                self._init_as_actual(self.copy())
            self._raw = Raw(value)
    @property
    def n_obs(self) -> int:
        """Number of observations."""
        return self._n_obs

    @property
    def n_vars(self) -> int:
        """Number of variables/features."""
        return self._n_vars

    @property
    def obs(self) -> pd.DataFrame:
        """One-dimensional annotation of observations (`pd.DataFrame`)."""
        return self._obs

    @obs.setter
    def obs(self, value: pd.DataFrame):
        if not isinstance(value, pd.DataFrame):
            raise ValueError('Can only assign pd.DataFrame.')
        if len(value) != self.n_obs:
            raise ValueError('Length does not match.')
        if self.isview:
            self._init_as_actual(self.copy())
        self._obs = value

    @property
    def var(self) -> pd.DataFrame:
        """One-dimensional annotation of variables/ features (`pd.DataFrame`)."""
        return self._var

    @var.setter
    def var(self, value: pd.DataFrame):
        if not isinstance(value, pd.DataFrame):
            raise ValueError('Can only assign pd.DataFrame.')
        if len(value) != self.n_vars:
            raise ValueError('Length does not match.')
        if self.isview:
            self._init_as_actual(self.copy())
        self._var = value

    @property
    def uns(self) -> MutableMapping:
        """Unstructured annotation (ordered dictionary)."""
        return self._uns

    @uns.setter
    def uns(self, value: MutableMapping):
        if not isinstance(value, MutableMapping):
            raise ValueError('Only mutable mapping types (e.g. dict) are allowed for `.uns`.')
        if self.isview:
            self._init_as_actual(self.copy())
        self._uns = value
    @property
    def obsm(self) -> AxisArraysBase:
        """Multi-dimensional annotation of observations (mutable structured :class:`~numpy.ndarray`).

        Stores for each key, a two or higher-dimensional :class:`~numpy.ndarray` of length
        ``n_obs``. Is sliced with ``data`` and ``obs`` but behaves otherwise like a
        :class:`dict`.
        """
        return self._obsm

    @obsm.setter
    def obsm(self, value):
        obsm = AxisArrays(self, 0, vals=convert_to_dict(value))
        if self.isview:
            self._init_as_actual(self.copy())
        self._obsm = obsm

    @property
    def varm(self) -> AxisArraysBase:
        """Multi-dimensional annotation of variables/ features (mutable structured :class:`~numpy.ndarray`).

        Stores for each key, a two or higher-dimensional :class:`~numpy.ndarray` of length
        ``n_vars``. Is sliced with ``data`` and ``var`` but behaves otherwise like a
        :class:`dict`.
        """
        return self._varm

    @varm.setter
    def varm(self, value):
        varm = AxisArrays(self, 1, vals=convert_to_dict(value))
        if self.isview:
            self._init_as_actual(self.copy())
        self._varm = varm
# @property
# def obsp(self) -> PairwiseArraysBase:
# """Pairwise annotation of observations, a mutable mapping with array-like values.
# Stores for each key, a two or higher-dimensional :class:`np.ndarray` whose
# first two dimensions are of length ``n_obs``. Is sliced with ``data`` and
# ``obs`` but behaves otherwise like a :class:`dict`.
# """
# return self._obsp
# @obsp.setter
# def obsp(self, value):
# obsp = PairwiseArrays(self, 0, vals=convert_to_dict(value))
# if self.isview:
# self._init_as_actual(self.copy())
# self._obsp = obsp
# @property
# def varp(self) -> PairwiseArraysBase:
# """Pairwise annotation of observations, a mutable mapping with array-like values.
# Stores for each key, a two or higher-dimensional :class:`np.ndarray` whose
# first two dimensions are of length ``n_var``. Is sliced with ``data`` and
# ``var`` but behaves otherwise like a :class:`dict`.
# """
# return self._varp
# @varp.setter
# def varp(self, value):
# varp = PairwiseArrays(self, 1, vals=convert_to_dict(value))
# if self.isview:
# self._init_as_actual(self.copy())
# self._varp = varp
    @property
    def obs_names(self) -> pd.Index:
        """Names of observations (alias for ``.obs.index``)."""
        return self.obs.index

    @obs_names.setter
    def obs_names(self, names: Sequence[str]):
        self._obs.index = names
        if not self._obs.index.is_unique:
            utils.warn_names_duplicates('obs')

    @property
    def var_names(self) -> pd.Index:
        """Names of variables (alias for ``.var.index``)."""
        return self._var.index

    @var_names.setter
    def var_names(self, names: Sequence[str]):
        self._var.index = names
        if not self._var.index.is_unique:
            utils.warn_names_duplicates('var')
def obs_keys(self) -> List[str]:
"""List keys of observation annotation :attr:`obs`."""
return self._obs.keys().tolist()
def var_keys(self) -> List[str]:
"""List keys of variable annotation :attr:`var`."""
return self._var.keys().tolist()
def obsm_keys(self) -> List[str]:
"""List keys of observation annotation :attr:`obsm`."""
return list(self._obsm.keys())
def varm_keys(self) -> List[str]:
"""List keys of variable annotation :attr:`varm`."""
return list(self._varm.keys())
def uns_keys(self) -> List[str]:
"""List keys of unstructured annotation."""
return sorted(list(self._uns.keys()))
    @property
    def isbacked(self) -> bool:
        """``True`` if object is backed on disk, ``False`` otherwise."""
        return self.filename is not None

    @property
    def isview(self) -> bool:
        """``True`` if object is view of another AnnData object, ``False`` otherwise."""
        return self._isview
    @property
    def filename(self) -> Optional[PathLike]:
        """Change to backing mode by setting the filename of a ``.h5ad`` file.

        - Setting the filename writes the stored data to disk.
        - Setting the filename when the filename was previously another name
          moves the backing file from the previous file to the new file. If you
          want to copy the previous file, use ``copy(filename='new_filename')``.
        """
        return self.file.filename

    @filename.setter
    def filename(self, filename: Optional[PathLike]):
        # convert early for later comparison
        filename = None if filename is None else Path(filename)
        # change from backing-mode back to full loading into memory
        if filename is None:
            if self.filename is not None:
                self.file._to_memory_mode()
            else:
                # both filename and self.filename are None
                # do nothing
                return
        else:
            if self.filename is not None:
                if self.filename != filename:
                    # write the content of self to the old file
                    # and close the file
                    self.write()
                    self.filename.rename(filename)
                else:
                    # do nothing
                    return
            else:
                # change from memory to backing-mode
                # write the content of self to disk
                self.write(filename, force_dense=True)
            # open new file for accessing
            self.file.open(filename, 'r+')
            # as the data is stored on disk, we can safely set self._X to None
            self._X = None
    def _set_backed(self, attr, value):
        if (not isinstance(self.file[attr], h5py.SparseDataset)
                and not issparse(value)):
            self.file[attr] = value
        else:
            del self.file[attr]
            self.file._file.create_dataset(attr, data=value)
def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]:
# deal with tuples of length 1
if isinstance(index, tuple) and len(index) == 1:
index = index[0]
# deal with pd.Series
if isinstance(index, pd.Series):
index: Index = index.values
if isinstance(index, tuple):
if len(index) > 2:
raise ValueError(
'AnnData can only be sliced in rows and columns.')
# deal with pd.Series
if isinstance(index[1], pd.Series):
index = index[0], index[1].values
if isinstance(index[0], pd.Series):
index = index[0].values, index[1]
no_slice = not any(isinstance(i, slice) for i in index)
both_scalars = all(isinstance(i, (int, str, type(None))) for i in index)
if no_slice and not both_scalars:
raise NotImplementedError(
'Slicing with two indices at the same time is not yet implemented. '
'As a workaround, do row and column slicing successively.')
# Speed up and error prevention for boolean indices (Don’t convert to integer indices)
# Needs to be refactored once we support a tuple of two arbitrary index types
if any(isinstance(i, np.ndarray) and i.dtype == bool for i in index):
return index
obs, var = unpack_index(index)
obs = _normalize_index(obs, self.obs_names)
var = _normalize_index(var, self.var_names)
return obs, var
# TODO: this is not quite complete...
    def __delitem__(self, index: Index):
        obs, var = self._normalize_indices(index)
        # TODO: does this really work?
        if not self.isbacked:
            del self._X[obs, var]
        else:
            X = self.file['X']
            del X[obs, var]
            self._set_backed('X', X)
        if var == slice(None):
            del self._obs.iloc[obs, :]
        if obs == slice(None):
            del self._var.iloc[var, :]
def __getitem__(self, index: Index) -> 'AnnData':
"""Returns a sliced view of the object."""
oidx, vidx = self._normalize_indices(index)
return AnnData(self, oidx=oidx, vidx=vidx, asview=True)
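    # Slicing sketch (illustrative only; 'gene1', 'group' are hypothetical
    # names): views behave like pandas indexing, positional for integers,
    # label-based for strings/categoricals:
    #
    #     adata[0:2]                        # view of the first two observations
    #     adata[:, ['gene1', 'gene2']]      # view of two variables by name
    #     adata[adata.obs['group'] == 'a']  # boolean mask over observations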
    def _remove_unused_categories(self, df_full, df_sub, uns):
        from pandas.api.types import is_categorical
        for k in df_full:
            if is_categorical(df_full[k]):
                all_categories = df_full[k].cat.categories
                df_sub[k].cat.remove_unused_categories(inplace=True)
                # also correct the colors...
                if k + '_colors' in uns:
                    # this is a strange hack...
                    if np.array(uns[k + '_colors']).ndim == 0:
                        uns[k + '_colors'] = np.array(uns[k + '_colors'])[None]
                    uns[k + '_colors'] = np.array(uns[k + '_colors'])[
                        np.where(np.in1d(
                            all_categories, df_sub[k].cat.categories))[0]]
    def rename_categories(self, key: str, categories: Sequence[Any]):
        """Rename categories of annotation ``key`` in
        :attr:`obs`, :attr:`var` and :attr:`uns`.

        Only supports passing a list/array-like ``categories`` argument.

        Besides calling ``self.obs[key].cat.categories = categories`` -
        similar for :attr:`var` - this also renames categories in unstructured
        annotation that uses the categorical annotation ``key``.

        Parameters
        ----------
        key
            Key for observations or variables annotation.
        categories
            New categories, the same number as the old categories.
        """
        if isinstance(categories, Mapping):
            raise ValueError('Only list-like `categories` is supported.')
        if key in self.obs:
            old_categories = self.obs[key].cat.categories.tolist()
            self.obs[key].cat.rename_categories(categories, inplace=True)
        elif key in self.var:
            old_categories = self.var[key].cat.categories.tolist()
            self.var[key].cat.rename_categories(categories, inplace=True)
        else:
            raise ValueError('{} is neither in `.obs` nor in `.var`.'
                             .format(key))
        # this is not a good solution
        # but depends on the scanpy conventions for storing the categorical key
        # as `groupby` in the `params` slot
        for k1, v1 in self.uns.items():
            if isinstance(v1, Mapping):
                if 'params' in v1 and 'groupby' in v1['params']:
                    if v1['params']['groupby'] == key:
                        for k2, v2 in v1.items():
                            # picks out the recarrays that are named according to the old
                            # categories
                            if isinstance(v2, np.ndarray) and v2.dtype.names is not None:
                                if list(v2.dtype.names) == old_categories:
                                    self.uns[k1][k2].dtype.names = categories
                                else:
                                    logger.warning(
                                        'Omitting {}/{} as old categories do not match.'
                                        .format(k1, k2))
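    # Usage sketch for rename_categories (hypothetical key 'group';
    # illustrative only):
    #
    #     adata.obs['group'] = pd.Categorical(['a', 'b', 'a'])
    #     adata.rename_categories('group', ['alpha', 'beta'])
    #     # .obs['group'] categories are now ['alpha', 'beta']; recarrays in
    #     # .uns keyed by the old category names are renamed as well.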
    def strings_to_categoricals(self, df: Optional[pd.DataFrame] = None):
        """Transform string annotations to categoricals.

        Only affects string annotations that lead to less categories than the
        total number of observations.

        If ``df`` is ``None``, modifies both :attr:`obs` and :attr:`var`,
        otherwise modifies ``df`` inplace.

        Turns the view of an :class:`~anndata.AnnData` into an actual
        :class:`~anndata.AnnData`.
        """
        dont_modify = False  # only necessary for backed views
        if df is None:
            dfs = [self.obs, self.var]
            if self.isview:
                if not self.isbacked:
                    self._init_as_actual(self.copy())
                else:
                    dont_modify = True
        else:
            dfs = [df]
        for df in dfs:
            string_cols = [
                key for key in df.columns
                if is_string_dtype(df[key])
                and not is_categorical(df[key])
            ]
            for key in string_cols:
                # make sure we only have strings (could be that there are
                # np.nans (float), -666, '-666', for instance)
                c = df[key].astype('U')
                # make a categorical
                c = pd.Categorical(c, categories=natsorted(np.unique(c)))
                if len(c.categories) < len(c):
                    if dont_modify:
                        raise RuntimeError(
                            'Please call `.strings_to_categoricals()` on full AnnData, not on this view. '
                            'You might encounter this error message while copying or writing to disk.')
                    df[key] = c
                    logger.info('... storing {!r} as categorical'.format(key))

    _sanitize = strings_to_categoricals  # backwards compat
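    # Usage sketch for strings_to_categoricals (illustrative only):
    #
    #     adata.obs['batch'] = ['day1', 'day2', 'day1']   # plain strings
    #     adata.strings_to_categoricals()
    #     # .obs['batch'] is now a pd.Categorical with naturally sorted
    #     # categories ['day1', 'day2']; columns with as many distinct
    #     # strings as rows are left untouched.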
    def _slice_uns_sparse_matrices_inplace(self, uns: MutableMapping, oidx: Index1D):
        # slice sparse matrices of n_obs × n_obs in self.uns
        if not (isinstance(oidx, slice) and
                oidx.start is None and oidx.step is None and oidx.stop is None):
            for k, v in uns.items():
                # treat nested dicts
                if isinstance(v, Mapping):
                    self._slice_uns_sparse_matrices_inplace(v, oidx)
                if isinstance(v, sparse.spmatrix) and v.shape == (
                        self.n_obs, self.n_obs):
                    uns[k] = v.tocsc()[:, oidx].tocsr()[oidx, :]
    def _inplace_subset_var(self, index: Index1D):
        """Inplace subsetting along variables dimension.

        Same as ``adata = adata[:, index]``, but inplace.
        """
        adata_subset = self[:, index].copy()
        self._init_as_actual(adata_subset, dtype=self._X.dtype)

    def _inplace_subset_obs(self, index: Index1D):
        """Inplace subsetting along observations dimension.

        Same as ``adata = adata[index, :]``, but inplace.
        """
        adata_subset = self[index].copy()
        self._init_as_actual(adata_subset, dtype=self._X.dtype)
    def __setitem__(self, index: Index, val: Union[int, float, np.ndarray, sparse.spmatrix]):
        if self.isview:
            raise ValueError('Object is view and cannot be accessed with `[]`.')
        obs, var = self._normalize_indices(index)
        if not self.isbacked:
            self._X[obs, var] = val
        else:
            X = self.file['X']
            X[obs, var] = val
            self._set_backed('X', X)
def __len__(self) -> int:
return self.shape[0]
    def transpose(self) -> 'AnnData':
        """Transpose whole object.

        Data matrix is transposed, observations and variables are interchanged.
        """
        if not self.isbacked:
            X = self._X
        else:
            X = self.file['X']
        if self.isview:
            raise ValueError(
                'You\'re trying to transpose a view of an `AnnData`, which is currently not implemented. '
                'Call `.copy()` before transposing.')

        def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix:
            return m.T.tocsr() if sparse.isspmatrix_csr(m) else m.T

        return AnnData(
            t_csr(X), self._var, self._obs, self._uns,
            self._varm.flipped(), self._obsm.flipped(),
            filename=self.filename,
            layers={k: t_csr(v) for k, v in self.layers.items()},
            dtype=self.X.dtype.name)

    T = property(transpose)
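    # Usage sketch for transpose (illustrative only):
    #
    #     adata = AnnData(np.zeros((100, 10)))
    #     adata.T.shape                                         # -> (10, 100)
    #     adata.transpose().obs_names.equals(adata.var_names)   # -> True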
    def to_df(self) -> pd.DataFrame:
        """Generate shallow :class:`~pandas.DataFrame`.

        The data matrix :attr:`X` is returned as
        :class:`~pandas.DataFrame`, where :attr:`obs_names` initializes the
        index, and :attr:`var_names` the columns.

        * No annotations are maintained in the returned object.
        * The data matrix is densified in case it is sparse.
        """
        if issparse(self._X):
            X = self._X.toarray()
        else:
            X = self._X
        return pd.DataFrame(X, index=self.obs_names, columns=self.var_names)
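    # Usage sketch for to_df (illustrative only):
    #
    #     df = adata.to_df()
    #     df.shape == adata.shape   # -> True; index = obs_names,
    #                               # columns = var_names, X densified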
    def _get_X(self, use_raw=False, layer=None):
        """Convenience method for getting expression values with common arguments and error handling."""
        is_layer = layer is not None
        if use_raw and is_layer:
            raise ValueError(
                "Cannot use expression from both layer and raw. You provided:"
                "'use_raw={}' and 'layer={}'".format(use_raw, layer)
            )
        if is_layer:
            return self.layers[layer]
        elif use_raw:
            if self.raw is None:
                raise ValueError("This AnnData doesn't have a value in `.raw`.")
            return self.raw.X
        else:
            return self.X
    def obs_vector(
        self, k: str, *, layer: Optional[str] = None
    ) -> np.ndarray:
        """Convenience function for returning a 1 dimensional ndarray of values
        from `.X`, `.layers[k]`, or `.obs`.

        Made for convenience, not performance. Intentionally permissive about
        arguments, for easy iterative use.

        Params
        ------
        k
            Key to use. Should be in `.var_names` or `.obs.columns`.
        layer
            What layer values should be returned from. If `None`, `.X` is used.

        Returns
        -------
        A one dimensional nd array, with values for each obs in the same order
        as `.obs_names`.
        """
        if layer == "X":
            if "X" in self.layers:
                pass
            else:
                warnings.warn(
                    "In a future version of AnnData, access to `.X` by passing"
                    " `layer='X'` will be removed. Instead pass `layer=None`.",
                    FutureWarning,
                )
                layer = None
        if k in self.obs:
            return self.obs[k].values
        idx = self._normalize_indices((slice(None), k))
        a = self._get_X(layer=layer)[idx]
        if issparse(a):
            a = a.toarray()
        return np.ravel(a)
    def var_vector(
        self, k, *, layer: Optional[str] = None
    ) -> np.ndarray:
        """Convenience function for returning a 1 dimensional ndarray of values
        from `.X`, `.layers[k]`, or `.var`.

        Made for convenience, not performance. Intentionally permissive about
        arguments, for easy iterative use.

        Params
        ------
        k
            Key to use. Should be in `.obs_names` or `.var.columns`.
        layer
            What layer values should be returned from. If `None`, `.X` is used.

        Returns
        -------
        A one dimensional nd array, with values for each var in the same order
        as `.var_names`.
        """
        if layer == "X":
            if "X" in self.layers:
                pass
            else:
                warnings.warn(
                    "In a future version of AnnData, access to `.X` by passing"
                    " `layer='X'` will be removed. Instead pass `layer=None`.",
                    FutureWarning,
                )
                layer = None
        if k in self.var:
            return self.var[k].values
        idx = self._normalize_indices((k, slice(None)))
        a = self._get_X(layer=layer)[idx]
        if issparse(a):
            a = a.toarray()
        return np.ravel(a)
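    # Usage sketch for obs_vector/var_vector (hypothetical keys and layer
    # names; illustrative only):
    #
    #     adata.obs_vector('gene1')                    # column of X for one var
    #     adata.obs_vector('n_counts')                 # or a column of .obs
    #     adata.var_vector('cell1', layer='spliced')   # row of a layer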
    def _get_obs_array(self, k, use_raw=False, layer=None):
        """Get an array from the layer (default layer='X') along the observation
        dimension by first looking up ``obs.keys`` and then ``var.index``."""
        if not use_raw or k in self.obs.columns:
            return self.obs_vector(k=k, layer=layer)
        else:
            return self.raw.obs_vector(k)

    def _get_var_array(self, k, use_raw=False, layer=None):
        """Get an array from the layer (default layer='X') along the variables
        dimension by first looking up ``var.keys`` and then ``obs.index``."""
        if not use_raw or k in self.var.columns:
            return self.var_vector(k=k, layer=layer)
        else:
            return self.raw.var_vector(k)
    def copy(self, filename: Optional[PathLike] = None) -> 'AnnData':
        """Full copy, optionally on disk."""
        if not self.isbacked:
            return AnnData(
                self._X.copy() if self._X is not None else None,
                self._obs.copy(),
                self._var.copy(),
                # deepcopy on DictView does not work and is unnecessary
                # as uns was copied already before
                self._uns.copy() if isinstance(self._uns, DictView) else deepcopy(self._uns),
                self._obsm.copy(), self._varm.copy(),
                raw=None if self._raw is None else self._raw.copy(),
                layers=dict(self.layers),
                dtype=self._X.dtype.name if self._X is not None else 'float32')
        else:
            if filename is None:
                raise ValueError(
                    'To copy an AnnData object in backed mode, '
                    'pass a filename: `.copy(filename=\'myfilename.h5ad\')`.')
            if self.isview:
                self.write(filename)
            else:
                from shutil import copyfile
                copyfile(self.filename, filename)
            return AnnData(filename=filename)
def concatenate(
self, *adatas: 'AnnData',
join: str = 'inner',
batch_key: str = 'batch',
batch_categories: Sequence[Any] = None,
index_unique: Optional[str] = '-'
) -> 'AnnData':
"""Concatenate along the observations axis.
The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored.
Currently, this works only in ``'memory'`` mode.
        Parameters
        ----------
        adatas
            AnnData matrices to concatenate with. Each matrix is referred to as
            a “batch”.
        join
            Use intersection (``'inner'``) or union (``'outer'``) of variables.
        batch_key
            Add the batch annotation to :attr:`obs` using this key.
        batch_categories
            Use these as categories for the batch annotation. By default, use increasing numbers.
        index_unique
            Make the index unique by joining the existing index names with the
            batch category, using ``index_unique='-'``, for instance. Provide
            ``None`` to keep existing indices.

        Returns
        -------
        The concatenated :class:`~anndata.AnnData`, where ``adata.obs[batch_key]``
        stores a categorical variable labeling the batch.
.. warning::
If you use ``join='outer'`` this fills 0s for sparse data when
variables are absent in a batch. Use this with care. Dense data is
filled with ``NaN``. See the examples.
Joining on intersection of variables.
>>> adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
>>> {'obs_names': ['s1', 's2'],
>>> 'anno1': ['c1', 'c2']},
>>> {'var_names': ['a', 'b', 'c'],
>>> 'annoA': [0, 1, 2]})
>>> adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
>>> {'obs_names': ['s3', 's4'],
>>> 'anno1': ['c3', 'c4']},
>>> {'var_names': ['d', 'c', 'b'],
>>> 'annoA': [0, 1, 2]})
>>> adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
>>> {'obs_names': ['s1', 's2'],
>>> 'anno2': ['d3', 'd4']},
>>> {'var_names': ['d', 'c', 'b'],
>>> 'annoA': [0, 2, 3],
>>> 'annoB': [0, 1, 2]})
>>> adata = adata1.concatenate(adata2, adata3)
>>> adata
AnnData object with n_obs × n_vars = 6 × 2
obs_keys = ['anno1', 'anno2', 'batch']
var_keys = ['annoA-0', 'annoA-1', 'annoB-2', 'annoA-2']
>>> adata.X
array([[2., 3.],
[5., 6.],
[3., 2.],
[6., 5.],
[3., 2.],
[6., 5.]], dtype=float32)
>>> adata.obs
anno1 anno2 batch
s1-0 c1 NaN 0
s2-0 c2 NaN 0
s3-1 c3 NaN 1
s4-1 c4 NaN 1
s1-2 NaN d3 2
s2-2 NaN d4 2
>>> adata.var.T
b c
annoA-0 1 2
annoA-1 2 1
annoB-2 2 1
annoA-2 3 2
Joining on the union of variables.
>>> adata = adata1.concatenate(adata2, adata3, join='outer')
>>> adata
AnnData object with n_obs × n_vars = 6 × 4
obs_keys = ['anno1', 'anno2', 'batch']
var_keys = ['annoA-0', 'annoA-1', 'annoB-2', 'annoA-2']
>>> adata.var.T
index a b c d
annoA-0 0.0 1.0 2.0 NaN
annoA-1 NaN 2.0 1.0 0.0
annoB-2 NaN 2.0 1.0 0.0
annoA-2 NaN 3.0 2.0 0.0
>>> adata.var_names
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> adata.X
array([[ 1., 2., 3., nan],
[ 4., 5., 6., nan],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.]], dtype=float32)
>>> adata.X.sum(axis=0)
array([nan, 25., 23., nan], dtype=float32)
>>> import pandas as pd
>>> Xdf = pd.DataFrame(adata.X, columns=adata.var_names)
index a b c d
0 1.0 2.0 3.0 NaN
1 4.0 5.0 6.0 NaN
2 NaN 3.0 2.0 1.0
3 NaN 6.0 5.0 4.0
4 NaN 3.0 2.0 1.0
5 NaN 6.0 5.0 4.0
>>> Xdf.sum()
a 5.0
b 25.0
c 23.0
d 10.0
dtype: float32
>>> from numpy import ma
>>> adata.X = ma.masked_invalid(adata.X)
        >>> adata.X
        masked_array(
          data=[[1.0, 2.0, 3.0, --],
                [4.0, 5.0, 6.0, --],
                [--, 3.0, 2.0, 1.0],
                [--, 6.0, 5.0, 4.0],
                [--, 3.0, 2.0, 1.0],
                [--, 6.0, 5.0, 4.0]],
          mask=[[False, False, False,  True],
                [False, False, False,  True],
                [ True, False, False, False],
                [ True, False, False, False],
                [ True, False, False, False],
                [ True, False, False, False]],
          fill_value=1e+20,
          dtype=float32)
>>> adata.X.sum(axis=0).data
array([ 5., 25., 23., 10.], dtype=float32)
The masked array is not saved but has to be reinstantiated after saving.
>>> adata.write('./test.h5ad')
>>> from anndata import read_h5ad
>>> adata = read_h5ad('./test.h5ad')
>>> adata.X
array([[ 1., 2., 3., nan],
[ 4., 5., 6., nan],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.]], dtype=float32)
For sparse data, everything behaves similarly, except that for ``join='outer'``, zeros are added.
>>> from scipy.sparse import csr_matrix
>>> adata1 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
>>> {'obs_names': ['s1', 's2'],
>>> 'anno1': ['c1', 'c2']},
>>> {'var_names': ['a', 'b', 'c']})
>>> adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
>>> {'obs_names': ['s3', 's4'],
>>> 'anno1': ['c3', 'c4']},
>>> {'var_names': ['d', 'c', 'b']})
>>> adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]),
>>> {'obs_names': ['s5', 's6'],
>>> 'anno2': ['d3', 'd4']},
>>> {'var_names': ['d', 'c', 'b']})
>>> adata = adata1.concatenate(adata2, adata3, join='outer')
>>> adata.var_names
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> adata.X.toarray()
array([[0., 2., 3., 0.],
[0., 5., 6., 0.],
[0., 3., 2., 0.],
[0., 6., 5., 0.],
[0., 0., 2., 1.],
               [0., 6., 5., 0.]], dtype=float32)
        """
        if self.isbacked:
raise ValueError(
'Currently, concatenate does only work in \'memory\' mode.')
if len(adatas) == 0:
return self
elif len(adatas) == 1 and not isinstance(adatas[0], AnnData):
adatas = adatas[0] # backwards compatibility
all_adatas = (self,) + tuple(adatas)
# for controlled behavior, make all variable names unique
printed_info = False
for i, ad in enumerate(all_adatas):
if not ad.var_names.is_unique:
ad.var_names = utils.make_index_unique(ad.var_names)
                if not printed_info:
                    logger.info(
                        'Making variable names unique for controlled concatenation.')
                    printed_info = True
# define variable names of joint AnnData
mergers = dict(inner=set.intersection, outer=set.union)
var_names_reduce = reduce(mergers[join], (set(ad.var_names) for ad in all_adatas))
# restore order of initial var_names, append non-sortable names at the end
# see how this was done in the repo at commit state
# 40a24f
var_names = []
        for v in all_adatas[0].var_names:
            if v in var_names_reduce:
                var_names.append(v)
                var_names_reduce.remove(v)  # update the set
        var_names = pd.Index(var_names + list(var_names_reduce))
        if batch_categories is None:
            categories = [str(i) for i, _ in enumerate(all_adatas)]
        elif len(batch_categories) == len(all_adatas):
            categories = batch_categories
        else:
            raise ValueError('Provide as many `batch_categories` as `adatas`.')
out_shape = (sum(a.n_obs for a in all_adatas), len(var_names))
any_sparse = any(issparse(a.X) for a in all_adatas)
        if join == 'outer':
            if any_sparse:  # not sure whether the lil_matrix is really the best option
                X = sparse.lil_matrix(out_shape, dtype=self.X.dtype)
            else:
                X = np.empty(out_shape, dtype=self.X.dtype)
                X[:] = np.nan
        else:
            Xs = []
# create layers dict that contains layers shared among all AnnDatas
layers = OrderedDict()
shared_layers = [key for key in all_adatas[0].layers.keys()
if all([key in ad.layers.keys() for ad in all_adatas])]
for key in shared_layers:
layers[key] = []
# check whether tries to do 'outer' join and layers is non_empty.
        if join == 'outer' and len(shared_layers) > 0:
            logger.warning(
                'layers concatenation is not yet available for \'outer\' intersection and will be ignored.')
# check whether layers are not consistently set in all AnnData objects.
n_layers = np.array([len(ad.layers.keys()) for ad in all_adatas])
        if join == 'inner' and not all(len(shared_layers) == n_layers):
            logger.warning(
                'layers are inconsistent - only layers that are shared among all AnnData objects are included.')
var = pd.DataFrame(index=var_names)
if join == 'inner':
ad_ref = all_adatas[0]
cols_intersect = set(ad_ref.var.columns)
for ad in all_adatas[1:]:
cols_intersect &= set(ad.var.columns)
            cols_intersect = {
                col for col in cols_intersect
                if ad_ref.var.loc[var_names, col].equals(ad.var.loc[var_names, col])
            }
            if not cols_intersect:
                break
obs_i = 0 # start of next adata’s observations in X
out_obss = []
for i, ad in enumerate(all_adatas):
if join == 'outer':
# only those names that are actually present in the current AnnData
vars_intersect = [v for v in var_names if v in ad.var_names]
vars_intersect = var_names
# X
            if join == 'outer':
                # this is pretty slow, I guess sparse matrices shouldn't be
                # constructed like that
                X[obs_i:obs_i+ad.n_obs,
                  var_names.isin(vars_intersect)] = ad[:, vars_intersect].X
            else:
                Xs.append(ad[:, vars_intersect].X)
obs_i += ad.n_obs
# layers
if join == 'inner':
for key in shared_layers:
layers[key].append(ad[:, vars_intersect].layers[key])
# obs
obs = ad.obs.copy()
obs[batch_key] = pd.Categorical(ad.n_obs * [categories[i]], categories)
            if (is_string_dtype(all_adatas[0].obs.index) and not
                    is_string_dtype(ad.obs.index)):
                obs.index = obs.index.astype(str)
if index_unique is not None:
if not is_string_dtype(ad.obs.index):
obs.index = obs.index.astype(str)
                obs.index = obs.index.values + index_unique + categories[i]
            out_obss.append(obs)
# var
            for c in ad.var.columns:
                if join == 'inner' and c in cols_intersect:
                    if c not in var.columns:
                        var.loc[vars_intersect, c] = ad.var.loc[vars_intersect, c]
                else:
                    new_c = c + (index_unique if index_unique is not None else '-') + categories[i]
                    var.loc[vars_intersect, new_c] = ad.var.loc[vars_intersect, c]
        if join == 'inner':
            from scipy.sparse import vstack
            if any_sparse:
                X = vstack(Xs)
            else:
                X = np.concatenate(Xs)
            for key in shared_layers:
                if any(issparse(a.layers[key]) for a in all_adatas):
                    layers[key] = vstack(layers[key])
                else:
                    layers[key] = np.concatenate(layers[key])
obs = pd.concat(out_obss, sort=True)
if any_sparse:
sparse_format = all_adatas[0].X.getformat()
X = X.asformat(sparse_format)
new_adata = AnnData(X, obs, var, layers=layers) if join == 'inner' else AnnData(X, obs, var)
        if not obs.index.is_unique:
            logger.info(
                'Or pass `index_unique!=None` to `.concatenate`.')
return new_adata
def var_names_make_unique(self, join: str = '-'):
self.var.index = utils.make_index_unique(self.var.index, join)
var_names_make_unique.__doc__ = utils.make_index_unique.__doc__
def obs_names_make_unique(self, join: str = '-'):
self.obs.index = utils.make_index_unique(self.obs.index, join)
obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__
    def _check_uniqueness(self):
        if not self.obs.index.is_unique:
            utils.warn_names_duplicates('obs')
        if not self.var.index.is_unique:
            utils.warn_names_duplicates('var')
def __contains__(self, key: Any):
raise AttributeError('AnnData has no attribute __contains__, '
'don\'t check `in adata`.')
def _check_dimensions(self, key=None):
        if key is None:
            key = {'obs', 'var', 'obsm', 'varm'}
        else:
            key = {key}
if 'obs' in key and len(self._obs) != self._n_obs:
raise ValueError('Observations annot. `obs` must have number of '
'rows of `X` ({}), but has {} rows.'
.format(self._n_obs, self._obs.shape[0]))
if 'var' in key and len(self._var) != self._n_vars:
raise ValueError('Variables annot. `var` must have number of '
'columns of `X` ({}), but has {} rows.'
.format(self._n_vars, self._var.shape[0]))
if 'obsm' in key:
obsm = self._obsm
if not all([o.shape[0] == self._n_obs for o in obsm.values()]) and len(obsm.dim_names) != self._n_obs:
raise ValueError('Observations annot. `obsm` must have number of '
'rows of `X` ({}), but has {} rows.'
.format(self._n_obs, len(self._obsm)))
if 'varm' in key:
varm = self._varm
            if not all([v.shape[0] == self._n_vars for v in varm.values()]) and len(varm.dim_names) != self._n_vars:
raise ValueError('Variables annot. `varm` must have number of '
'columns of `X` ({}), but has {} rows.'
.format(self._n_vars, len(self._varm)))
    def write_h5ad(
        self,
        filename: Optional[PathLike] = None,
        compression: Optional[str] = None,
        compression_opts: Union[int, Any] = None,
        force_dense: Optional[bool] = None,
    ):
        """Write ``.h5ad``-formatted hdf5 file.

        .. note::

            Setting compression to ``'gzip'`` can save disk space but
            will slow down writing and subsequent reading. Prior to
            v0.6.16, this was the default for parameter ``compression``.

        Generally, if you have sparse data that are stored as a dense
        matrix, you can dramatically improve performance and reduce
        disk space by converting to a :class:`~scipy.sparse.csr_matrix`::

            from scipy.sparse import csr_matrix
            adata.X = csr_matrix(adata.X)

        Parameters
        ----------
        filename
            Filename of data file. Defaults to backing file.
        compression : ``None``, {``'gzip'``, ``'lzf'``} (default: ``None``)
            See the h5py :ref:`dataset_compression`.
        compression_opts
            See the h5py :ref:`dataset_compression`.
        force_dense
            Write sparse data as a dense matrix. Defaults to ``True`` if object is
            backed, otherwise to ``False``.
        """
        from ..readwrite.write import _write_h5ad

        if filename is None and not self.isbacked:
            raise ValueError('Provide a filename!')
        if filename is None:
            filename = self.filename
        if force_dense is None:
            force_dense = self.isbacked

        _write_h5ad(filename, self, compression=compression,
                    compression_opts=compression_opts, force_dense=force_dense)

        if self.isbacked:
            self.file.close()

    write = write_h5ad  # a shortcut and backwards compat
    def write_csvs(self, dirname: PathLike, skip_data: bool = True, sep: str = ','):
        """Write annotation to ``.csv`` files.

        It is not possible to recover the full :class:`~anndata.AnnData` from the
        output of this function. Use :meth:`~anndata.AnnData.write` for this.

        Parameters
        ----------
        dirname
            Name of directory to which to export.
        skip_data
            Skip the data matrix :attr:`X`.
        sep
            Separator for the data.
        """
        from ..readwrite.write import write_csvs
        write_csvs(dirname, self, skip_data=skip_data, sep=sep)
    def write_loom(self, filename: PathLike, write_obsm_varm: bool = False):
        """Write ``.loom``-formatted hdf5 file.

        Parameters
        ----------
        filename
            The filename.
        """
        from ..readwrite.write import write_loom
        write_loom(filename, self, write_obsm_varm=write_obsm_varm)
    def write_zarr(
        self,
        store: Union[MutableMapping, PathLike],
        chunks: Union[bool, int, Tuple[int, ...]],
    ):
        """Write a hierarchical Zarr array store.

        Parameters
        ----------
        store
            The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class.
        chunks
            Chunk shape.
        """
        from ..readwrite.write import write_zarr
        write_zarr(store, self, chunks=chunks)
    def chunked_X(self, chunk_size: Optional[int] = None):
        """Return an iterator over the rows of the data matrix :attr:`X`.

        Parameters
        ----------
        chunk_size
            Row size of a single chunk.
        """
        if chunk_size is None:
            # Should be some adaptive code
            chunk_size = 6000
        start = 0
        n = self.n_obs
        for _ in range(int(n // chunk_size)):
            end = start + chunk_size
            yield (self.X[start:end], start, end)
            start = end
        if start < n:
            yield (self.X[start:n], start, n)
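    # Usage sketch for chunked_X (illustrative only; `process` is a
    # hypothetical placeholder):
    #
    #     for chunk, start, end in adata.chunked_X(1000):
    #         process(chunk)   # chunk is adata.X[start:end]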
    def chunk_X(
        self,
        select: Union[int, List[int], Tuple[int, ...], np.ndarray] = 1000,
        replace: bool = True,
    ):
        """Return a chunk of the data matrix :attr:`X` with random or specified indices.

        Parameters
        ----------
        select
            If select is an integer, a random chunk of row size = select will be returned.
            If select is a list, tuple or numpy array of integers, then a chunk
            with these indices will be returned.
        replace
            If select is an integer then ``replace=True`` specifies random sampling of indices
            with replacement, ``replace=False`` without replacement.
        """
        if isinstance(select, int):
            select = select if select < self.n_obs else self.n_obs
            choice = np.random.choice(self.n_obs, select, replace)
        elif isinstance(select, (np.ndarray, list, tuple)):
            choice = np.asarray(select)
        else:
            raise ValueError('select should be int or array')

        reverse = None
        if self.isbacked:
            # h5py can only slice with a sorted list of unique index values
            # so random batch with indices [2, 2, 5, 3, 8, 10, 8] will fail
            # this fixes the problem
            indices, reverse = np.unique(choice, return_inverse=True)
            selection = self.X[indices.tolist()]
        else:
            selection = self.X[choice]
        selection = selection.toarray() if issparse(selection) else selection
        return selection if reverse is None else selection[reverse]
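    # Usage sketch for chunk_X (illustrative only):
    #
    #     batch = adata.chunk_X(256)          # 256 random rows (dense ndarray)
    #     rows = adata.chunk_X([0, 5, 5, 2])  # specific rows, duplicates kept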
    def _args_from_dict(ddata: Mapping[str, Any]):
        """Allows to construct an instance of AnnData from a dictionary.

        Acts as interface for the communication with the hdf5 file.

        In particular, from a dict that has been written using
        ``AnnData._to_dict_fixed_width_arrays``.
        """
        d_true_keys = {}
# backwards compat
uns_is_not_key = False
valid_keys = []
for keys in AnnData._H5_ALIASES.values():
valid_keys += keys
valid_keys += ['raw.X', 'raw.var', 'raw.varm', '']
for key in ddata.keys():
            # if there is another key than the predefined ones,
            # then we are reading the old format
if key not in valid_keys:
uns_is_not_key = True
        for true_key, keys in AnnData._H5_ALIASES.items():
            for key in keys:
                if key in ddata:
                    d_true_keys[true_key] = ddata[key]
                    if uns_is_not_key: del ddata[key]
                    break
            else:
                d_true_keys[true_key] = None
# transform recarray to dataframe
for true_key, keys in AnnData._H5_ALIASES_NAMES.items():
if d_true_keys[true_key] is not None:
                for key in keys:
                    if key in d_true_keys[true_key].dtype.names:
                        d_true_keys[true_key] = pd.DataFrame.from_records(
                            d_true_keys[true_key], index=key)
                        break
d_true_keys[true_key].index = d_true_keys[true_key].index.astype('U')
# transform to unicode string
# TODO: this is quite a hack
                for c in d_true_keys[true_key].columns:
                    if is_string_dtype(d_true_keys[true_key][c]):
                        d_true_keys[true_key][c] = pd.Index(
                            d_true_keys[true_key][c]).astype('U').values
# these are the category fields
        k_to_delete = []
        items = (
            ddata.items() if uns_is_not_key
            else ddata['uns'].items() if 'uns' in ddata else []
        )
        for k, v in items:
            if k.endswith('_categories'):
                k_stripped = k.replace('_categories', '')
                if isinstance(v, (str, int)):  # fix categories with a single category
                    v = [v]
                for ann in ['obs', 'var']:
                    if k_stripped in d_true_keys[ann]:
                        d_true_keys[ann][k_stripped] = pd.Categorical.from_codes(
                            codes=d_true_keys[ann][k_stripped].values,
                            categories=v)
                        k_to_delete.append(k)
        for k in k_to_delete:
            if uns_is_not_key:
                del ddata[k]
            else:
                del ddata['uns'][k]
# assign the variables
X = d_true_keys['X']
obs = d_true_keys['obs']
obsm = d_true_keys['obsm']
var = d_true_keys['var']
varm = d_true_keys['varm']
layers = d_true_keys['layers']
raw = None
if 'raw.X' in ddata:
raw = {}
raw['X'] = ddata['raw.X']
del ddata['raw.X']
# get the dataframe
raw['var'] = pd.DataFrame.from_records(
ddata['raw.var'], index='index')
del ddata['raw.var']
raw['var'].index = raw['var'].index.astype('U')
# transform to unicode string
for c in raw['var'].columns:
if is_string_dtype(raw['var'][c]):
raw['var'][c] = pd.Index(raw['var'][c]).astype('U').values
# these are the category fields
if '' in ddata: # old h5ad didn't have that field
for k, v in ddata[''].items():
if k.endswith('_categories'):
k_stripped = k.replace('_categories', '')
if isinstance(v, (str, int)): # fix categories with a single category
v = [v]
                    raw['var'][k_stripped] = pd.Categorical.from_codes(
                        codes=raw['var'][k_stripped].values,
                        categories=v)
del ddata['']
if 'raw.varm' in ddata:
raw['varm'] = ddata['raw.varm']
del ddata['raw.varm']
elif raw is not None:
raw['varm'] = None
# the remaining fields are the unstructured annotation
        uns = (
            ddata if uns_is_not_key
            else ddata['uns'] if 'uns' in ddata
            else {}
        )
        return X, obs, var, uns, obsm, varm, layers, raw
    def _to_dict_fixed_width_arrays(self, var_len_str=True):
        """A dict of arrays that stores data and annotation.

        It is sufficient for reconstructing the object.
        """
obs_rec, uns_obs = df_to_records_fixed_width(self._obs, var_len_str)
var_rec, uns_var = df_to_records_fixed_width(self._var, var_len_str)
layers = dict(self.layers)
d = {
'X': self._X,
'obs': obs_rec,
'var': var_rec,
'obsm': self._obsm,
'varm': self._varm,
'layers': layers,
# add the categories to the unstructured annotation
'uns': {**self._uns, **uns_obs, **uns_var}}
if self.raw is not None:
var_rec, uns_var = df_to_records_fixed_width(self.raw._var, var_len_str)
d['raw.X'] = self.raw.X
d['raw.var'] = var_rec
d['raw.varm'] = self.raw.varm
d[''] = uns_var
return d
# --------------------------------------------------------------------------
# all of the following is for backwards compat
# --------------------------------------------------------------------------
    @property
    def data(self):
        return self.X

    @data.setter
    def data(self, value):
        self.X = value

    @property
    def n_smps(self):
        return self.n_obs

    @property
    def smp(self):
        return self.obs

    @smp.setter
    def smp(self, value):
        self.obs = value

    @property
    def add(self):
        return self.uns

    @add.setter
    def add(self, value):
        self.uns = value

    @property
    def smpm(self):
        return self.obsm

    @smpm.setter
    def smpm(self, value):
        self.obsm = value

    @property
    def smp_names(self):
        return self.obs_names

    @smp_names.setter
    def smp_names(self, names):
        self.obs_names = names

    def smp_keys(self):
        return self.obs_keys()

    def smpm_keys(self):
        return self.obsm_keys()
def _clean_up_old_format(self, uns):
# multicolumn keys
# all of the rest is only for backwards compat
if uns and '_obs_keys_multicol' in uns:
_keys_multicol_obs = list(uns['_obs_keys_multicol'])
del uns['_obs_keys_multicol']
elif uns and 'obs_keys_multicol' in uns:
_keys_multicol_obs = list(uns['obs_keys_multicol'])
del uns['obs_keys_multicol']
elif uns and '_smp_keys_multicol' in uns:
_keys_multicol_obs = list(uns['_smp_keys_multicol'])
del uns['_smp_keys_multicol']
        elif uns and 'smp_keys_multicol' in uns:
            _keys_multicol_obs = list(uns['smp_keys_multicol'])
            del uns['smp_keys_multicol']
        else:
            _keys_multicol_obs = []
if uns and '_var_keys_multicol' in uns:
_keys_multicol_var = list(uns['_var_keys_multicol'])
del uns['_var_keys_multicol']
        elif uns and 'var_keys_multicol' in uns:
            _keys_multicol_var = list(uns['var_keys_multicol'])
            del uns['var_keys_multicol']
        else:
            _keys_multicol_var = []
# now, for compat, fill the old multicolumn entries into obsm and varm
# and remove them from obs and var
for key in _keys_multicol_obs:
self._obsm[key] = self._get_multicol_field_obs(key)
for key in _keys_multicol_var:
self._varm[key] = self._get_multicol_field_var(key)
def _get_multicol_field_obs(self, key_multicol):
return self._get_and_delete_multicol_field('obs', key_multicol)
def _get_multicol_field_var(self, key_multicol):
return self._get_and_delete_multicol_field('var', key_multicol)
    def _get_and_delete_multicol_field(self, a, key_multicol):
        keys = []
        for k in getattr(self, a).columns:
            if k.startswith(key_multicol):
                keys.append(k)
        values = getattr(self, a)[keys].values
        getattr(self, a).drop(keys, axis=1, inplace=True)
        return values