remove filtering, change copy to inplace

scverse · Mar 8, 2019 · d9f2367 · d9f2367
1 parent d223281
commit d9f2367
Showing 1 changed file with 69 additions and 62 deletions.
diff --git a/scanpy/preprocessing/normalization.py b/scanpy/preprocessing/normalization.py
@@ -3,20 +3,24 @@
 from sklearn.utils import sparsefuncs
 from .. import logging as logg
 
-def _normalize_data(X, counts, after=None, copy=False):
+def _normalize_data(X, counts, after=None, cell_subset=None, copy=False):
     X = X.copy() if copy else X
-    after = np.median(counts) if after is None else after
+    if after is None:
+        after = np.median(counts[cell_subset]) if cell_subset is not None else np.median(counts)
+    if cell_subset is None:
+        counts /= after
+    else:
+        counts[np.logical_not(cell_subset)] = 1
+        counts[cell_subset] = counts[cell_subset]/after
     counts += (counts == 0)
-    counts /= after
     if issparse(X):
         X = sparsefuncs.inplace_row_scale(X, 1/counts)
     else:
         X /= counts[:, None]
     return X if copy else None
 
-def normalize_quantile(data, counts_per_cell_after=None, counts_per_cell=None,
-                       quantile=1, min_counts=1, key_n_counts=None, copy=False,
-                       layers=[], use_rep=None):
+def normalize_quantile(data, cell_sum_after=None, quantile=1, min_counts=1, key_n_counts=None,
+                       inplace=True, layers=[], layer_norm=None):
     """Normalize total counts per cell.
 
     Normalize each cell by total counts over genes, so that every cell has
@@ -30,31 +34,30 @@ def normalize_quantile(data, counts_per_cell_after=None, counts_per_cell=None,
     data : :class:`~anndata.AnnData`
         The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
         to cells and columns to genes.
-    counts_per_cell_after : `float` or `None`, optional (default: `None`)
+    cell_sum_after : `float` or `None`, optional (default: `None`)
         If `None`, after normalization, each cell has a total count equal
         to the median of the *counts_per_cell* before normalization.
-    counts_per_cell : `np.array`, optional (default: `None`)
-        Precomputed counts per cell.
     quantile : `float`, optional (default: 1)
-        Only use genes are less than fraction (specified by *quantile*) of the total
-        reads in every cell.
+        Only use genes that account for no more than the fraction
+        *quantile* of the total reads in every cell.
     min_counts : `int`, optional (default: 1)
         Cells with counts less than `min_counts` are filtered out during
         normalization.
-    key_n_counts : `str`, optional (default: `'n_counts'`)
+    key_n_counts : `str`, optional (default: `None`)
         Name of the field in `adata.obs` where the total counts per cell are
         stored.
-    copy : `bool`, optional (default: `False`)
-        If an :class:`~anndata.AnnData` is passed, determines whether a copy
-        is returned.
+    inplace : `bool`, optional (default: `True`)
+        Whether to update `data.X` and `data.layers` in place or to return
+        a dictionary with normalized copies of `data.X` and `data.layers`.
     layers : `str` or list of `str`, optional (default: `[]`)
         List of layers to normalize. Set to `'all'` to normalize all layers.
-    use_rep : `str` or `None`, optional (default: `None`)
+    layer_norm : `str` or `None`, optional (default: `None`)
         Specifies how to normalize layers.
-        If `None`, after normalization, for each layer in *layers* each cell has a total count equal
-        to the median of the *counts_per_cell* before normalization of the layer.
-        If `'after'`, for each layer in *layers* each cell has a total count equal
-        to counts_per_cell_after.
+        If `None`, after normalization, for each layer in *layers* each cell
+        has a total count equal to the median of the *counts_per_cell* before
+        normalization of the layer.
+        If `'after'`, for each layer in *layers* each cell has
+        a total count equal to cell_sum_after.
         If `'X'`, for each layer in *layers* each cell has a total count equal
         to the median of the *counts_per_cell* of data.X before normalization.
 
@@ -75,63 +78,68 @@ def normalize_quantile(data, counts_per_cell_after=None, counts_per_cell=None,
     if quantile < 0 or quantile > 1:
         raise ValueError('Choose quantile between 0 and 1.')
 
-    if key_n_counts is None: key_n_counts = 'n_counts'
-
-    adata = data.copy() if copy else data
-    X = adata.X
+    X = data.X
     gene_subset = None
+    if not inplace:
+        # a plain dict rather than a recarray, because sparse matrices must be supported
+        dat = {}
 
     if quantile < 1:
         logg.msg('normalizing by count per cell for \
                   genes that make up less than quantile * total count per cell', r=True)
-        X = adata.X
+        X = data.X
 
-        counts_per_cell = counts_per_cell if counts_per_cell is not None else X.sum(1)
+        counts_per_cell = X.sum(1)
         counts_per_cell = np.ravel(counts_per_cell)
 
         gene_subset = (X>counts_per_cell[:, None]*quantile).sum(0)
         gene_subset = (np.ravel(gene_subset) == 0)
     else:
         logg.msg('normalizing by total count per cell', r=True)
 
-    if counts_per_cell is None or quantile < 1:
-        X = X if gene_subset is None else adata[:, gene_subset].X
-        counts_per_cell = X.sum(1)
-        counts_per_cell = np.ravel(counts_per_cell)
+    X = X if gene_subset is None else data[:, gene_subset].X
+    counts_per_cell = X.sum(1)
+    #get rid of data view
+    counts_per_cell = np.ravel(counts_per_cell).copy()
     del X
     del gene_subset
 
-    adata.obs[key_n_counts] = counts_per_cell
+    if key_n_counts is not None:
+        adata.obs[key_n_counts] = counts_per_cell
     cell_subset = counts_per_cell >= min_counts
-    adata._inplace_subset_obs(cell_subset)
-    counts_per_cell = counts_per_cell[cell_subset]
-
-    if use_rep == 'after':
-        after = counts_per_cell_after
-    elif use_rep == 'X':
-        after = np.median(counts_per_cell)
-    elif use_rep is None:
+
+    if layer_norm == 'after':
+        after = cell_sum_after
+    elif layer_norm == 'X':
+        after = np.median(counts_per_cell[cell_subset])
+    elif layer_norm is None:
         after = None
-    else: raise ValueError('use_rep should be "after", "X" or None')
+    else: raise ValueError('layer_norm should be "after", "X" or None')
 
-    _normalize_data(adata.X, counts_per_cell, counts_per_cell_after)
+    if inplace:
+        _normalize_data(data.X, counts_per_cell, cell_sum_after, cell_subset)
+    else:
+        dat['X'] = _normalize_data(data.X, counts_per_cell, cell_sum_after, cell_subset, True)
 
-    layers = adata.layers.keys() if layers == 'all' else layers
+    layers = data.layers.keys() if layers == 'all' else layers
     for layer in layers:
-        L = adata.layers[layer]
+        L = data.layers[layer]
         counts = np.ravel(L.sum(1))
-        _normalize_data(L, counts, after)
+        if inplace:
+            _normalize_data(L, counts, after)
+        else:
+            dat[layer] = _normalize_data(L, counts, after, copy=True)
 
     logg.msg('    finished', t=True, end=': ')
-    logg.msg('normalized adata.X and added', no_indent=True)
-    logg.msg('    \'{}\', counts per cell before normalization (adata.obs)'
-        .format(key_n_counts))
+    logg.msg('normalized adata.X')
+    if key_n_counts is not None:
+        logg.msg('and added \'{}\', counts per cell before normalization (adata.obs)'
+            .format(key_n_counts))
 
-    return adata if copy else None
+    return dat if not inplace else None
 
-def normalize_total(data, counts_per_cell_after=None, counts_per_cell=None,
-                    key_n_counts=None, copy=False, layers=[], use_rep=None,
-                    min_counts=1):
+def normalize_total(data, cell_sum_after=None, counts_per_cell=None, key_n_counts=None,
+                    inplace=True, layers=[], layer_norm=None, min_counts=1):
     """Normalize total counts per cell.
 
     Normalize each cell by total counts over all genes, so that every cell has
@@ -145,7 +153,7 @@ def normalize_total(data, counts_per_cell_after=None, counts_per_cell=None,
     data : :class:`~anndata.AnnData`
         The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
         to cells and columns to genes.
-    counts_per_cell_after : `float` or `None`, optional (default: `None`)
+    cell_sum_after : `float` or `None`, optional (default: `None`)
         If `None`, after normalization, each cell has a total count equal
         to the median of the *counts_per_cell* before normalization.
     counts_per_cell : `np.array`, optional (default: `None`)
@@ -158,12 +166,12 @@ def normalize_total(data, counts_per_cell_after=None, counts_per_cell=None,
         is returned.
     layers : `str` or list of `str`, optional (default: `[]`)
         List of layers to normalize. Set to `'all'` to normalize all layers.
-    use_rep : `str` or `None`, optional (default: `None`)
+    layer_norm : `str` or `None`, optional (default: `None`)
         Specifies how to normalize layers.
         If `None`, after normalization, for each layer in *layers* each cell has a total count equal
         to the median of the *counts_per_cell* before normalization of the layer.
         If `'after'`, for each layer in *layers* each cell has a total count equal
-        to counts_per_cell_after.
+        to cell_sum_after.
         If `'X'`, for each layer in *layers* each cell has a total count equal
         to the median of the *counts_per_cell* of data.X before normalization.
     min_counts : `int`, optional (default: 1)
@@ -177,19 +185,19 @@ def normalize_total(data, counts_per_cell_after=None, counts_per_cell=None,
 
     Examples
     --------
-    >>> adata = AnnData(data=np.array([[1, 0], [3, 0], [5, 6]]))
+    >>> adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
     >>> print(adata.X.sum(axis=1))
     [  1.   3.  11.]
-    >>> sc.pp.normalize_per_cell(adata)
+    >>> sc.pp.normalize_total(adata)
     >>> print(adata.obs)
     >>> print(adata.X.sum(axis=1))
        n_counts
     0       1.0
     1       3.0
     2      11.0
     [ 3.  3.  3.]
-    >>> sc.pp.normalize_per_cell(adata, counts_per_cell_after=1,
-    >>>                          key_n_counts='n_counts2')
+    >>> sc.pp.normalize_total(adata, cell_sum_after=1,
+    >>>                       key_n_counts='n_counts2')
     >>> print(adata.obs)
     >>> print(adata.X.sum(axis=1))
        n_counts  n_counts2
@@ -198,7 +206,6 @@ def normalize_total(data, counts_per_cell_after=None, counts_per_cell=None,
     2      11.0        3.0
     [ 1.  1.  1.]
     """
-    return normalize_quantile(data=data, counts_per_cell_after=counts_per_cell_after,
-                              counts_per_cell=counts_per_cell, key_n_counts=key_n_counts,
-                              copy=copy, layers=layers, use_rep=use_rep, min_counts=min_counts,
-                              quantile=1)
+    return normalize_quantile(data=data, cell_sum_after=cell_sum_after,
+                              key_n_counts=key_n_counts, inplace=inplace, layers=layers,
+                              layer_norm=layer_norm, min_counts=min_counts, quantile=1)