In [96]:
import numpy as np
from scipy import sparse
import numba
import pandas as pd
import scanpy as sc

# install dask if available
try:
    import dask.array as da
except ImportError:
    da = None

In [98]:
help(sc.pp)

Help on package scanpy.preprocessing in scanpy:

NAME
    scanpy.preprocessing

PACKAGE CONTENTS
    _combat
    _deprecated (package)
    _distributed
    _docs
    _highly_variable_genes
    _normalization
    _pca
    _qc
    _recipes
    _simple
    _utils

FILE
    /opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/__init__.py




In [4]:
def toarray(matrix1d):
    return np.array(matrix1d).flatten()

setattr(np.matrix, 'toarray', toarray)

In [2]:
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
b = sparse.csr_matrix((data, indices, indptr), shape=(3, 3))

In [8]:
df = pd.DataFrame.sparse.from_spmatrix(b)

In [35]:
a = [1,2,3]
df[['haa', 'ad', 'ao']] = a,a,a
df['haa', 'ad', 'ao'] = a,a,a

In [91]:
def haha(df):
    df['haha'] = ['haha'] * df.shape[0]
    
haha(df)

In [56]:
df[[True, True, False]]

Unnamed: 0,0,1,2,haa,ad,ao,"(haa, ad, ao)"
0,1,0,2,1,2,3,"(1, 2, 3)"
1,0,0,3,1,2,3,"(1, 2, 3)"


In [54]:
df

Unnamed: 0,0,1,2,haa,ad,ao,"(haa, ad, ao)"
0,1,0,2,1,2,3,"(1, 2, 3)"
1,0,0,3,1,2,3,"(1, 2, 3)"
2,4,5,6,1,2,3,"(1, 2, 3)"


In [95]:
a = np.zeros(2)
a.std(ddof=1).isnull()

AttributeError: 'numpy.float64' object has no attribute 'isnull'

In [47]:
np.ones(10, dtype=bool)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [21]:
b.todense()

matrix([[1, 0, 2],
        [0, 0, 3],
        [4, 5, 6]])

In [49]:
b[[0,2], [1,2]]

matrix([[0, 6]])

In [88]:
path = './data/hg19'
pd.read_csv(f'{path}/genes.tsv', sep='\t', header=None, usecols=[1]).set_index(1)

MIR1302-10
FAM138A
OR4F5
RP11-34P13.7
RP11-34P13.8
...
AC145205.1
BAGE5
CU459201.1
AC002321.2
AC002321.1


In [75]:
row_names = np.array(['a', 'b', 'c'])

In [19]:
np.where(row_names == 'a')

(array([0]),)

In [15]:
df.loc[[True, True, False],[False, True, True]]

Unnamed: 0,1,2
0,0,2
1,0,3


In [6]:
b_copy = b.copy()
b_copy.data **= 2

In [63]:
b[:, [True, True, False]]

<3x2 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [60]:
b

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [7]:
np.square(b_copy.mean(0))

matrix([[ 32.11111111,  69.44444444, 266.77777778]])

In [8]:
np.array(b.mean(axis=0)).flatten() ** 2

array([ 2.77777778,  2.77777778, 13.44444444])

In [48]:
np.nan > 0.01

False

In [9]:
variance = b_copy.mean(axis=0).toarray() - b.mean(axis=0).toarray() ** 2

In [10]:
def my_get_mean_var(X, axis=0):
    if isinstance(X, sparse.spmatrix):  # same as sparse.issparse()
        mean, var = my_sparse_mean_variance_axis(X, axis=axis)
        var *= X.shape[axis] / (X.shape[axis] - 1)
    else:
        mean = np.mean(X, axis=axis)
        mean_sq = np.var(X, axis=axis, ddof=1)  # a little overhead (mean counted twice, but it's ok.)
    return mean, var
'''
In standard statistical practice, ddof=1 provides an unbiased estimator of the variance
of a hypothetical infinite population. ddof=0 provides a maximum likelihood estimate of
the variance for normally distributed variables.
'''


def my_sparse_mean_variance_axis(mtx: sparse.spmatrix, axis: int):
    if isinstance(mtx, sparse.csr_matrix):
        ax_minor = 1
        shape = mtx.shape
    elif isinstance(mtx, sparse.csc_matrix):
        ax_minor = 0
        shape = mtx.shape[::-1]
    else:
        raise ValueError('This function only works on sparse csr and csc matrices')
    if axis == ax_minor:
        print(1)
        return my_sparse_mean_var_major_axis(
            mtx.data, mtx.indices, mtx.indptr, *shape, np.float64
        )
    else:
        print(0)
        return my_sparse_mean_var_minor_axis(
            mtx.data, mtx.indices, *shape, np.float64
        )
    

def my_sparse_mean_var_major_axis(
    data,
    indices,
    indptr,
    major_len,
    minor_len,
    dtype
):
    means = np.zeros(major_len, dtype=dtype)
    variances = np.zeros_like(means, dtype=dtype)  # why use zeros_like?
    for ind, (startptr, endptr) in enumerate(zip(indptr[:-1], indptr[1:])):
        counts = endptr - startptr
        
        mean = sum(data[startptr:endptr])
        variance = sum((i-means[i]) ** 2 for i in data[startptr:endptr]) + mean ** 2 * (minor_len - counts)
        means[ind] = mean / minor_len
        variances[ind] = variance / minor_len
        
    return means, variances


def my_sparse_mean_var_minor_axis(
    data,
    indices,
    major_len,
    minor_len,
    dtype
):
    non_zero = indices.shape[0] # same as len(indices)?

    means = np.zeros(minor_len, dtype=dtype)
    variances = np.zeros_like(means, dtype=dtype)

    counts = np.zeros(minor_len, dtype=np.int64)
    
    for ind, num in zip(indices, data):
        means[ind] += num
    
    means /= major_len
    
    for ind, num in zip(indices, data):
        variance[ind] += (num - means[ind]) ** 2
        count[ind] += 1
    print(variances)
        
    variances += [mean ** 2 * (major_len - count) for mean, count in zip(means, counts)]
    variances /= major_len
    
    return means, variances

In [11]:
def get_mean_var(X, *, axis=0):
    if sparse.issparse(X):
        mean, var = sparse_mean_variance_axis(X, axis=axis)
    else:
        mean = np.mean(X, axis=axis, dtype=np.float64)
        mean_sq = np.multiply(X, X).mean(axis=axis, dtype=np.float64)
        var = mean_sq - mean ** 2
    # enforce R convention (unbiased estimator) for variance
    var *= X.shape[axis] / (X.shape[axis] - 1)
    return mean, var


def sparse_mean_variance_axis(mtx: sparse.spmatrix, axis: int) -> np.ndarray:
    """
    This code and internal functions are based on sklearns
    `sparsefuncs.mean_variance_axis`.
    Modifications:
    * allow deciding on the output type, which can increase accuracy when calculating the mean and variance of 32bit floats.
    * This doesn't currently implement support for null values, but could.
    * Uses numba not cython
    """
    assert axis in (0, 1)
    if isinstance(mtx, sparse.csr_matrix):
        ax_minor = 1
        shape = mtx.shape
    elif isinstance(mtx, sparse.csc_matrix):
        ax_minor = 0
        shape = mtx.shape[::-1]
    else:
        raise ValueError("This function only works on sparse csr and csc matrices")
    if axis == ax_minor:
        return sparse_mean_var_major_axis(
            mtx.data, mtx.indices, mtx.indptr, *shape, np.float64
        )
    else:
        return sparse_mean_var_minor_axis(mtx.data, mtx.indices, *shape, np.float64)


@numba.njit(cache=True)
def sparse_mean_var_minor_axis(
    data,
    indices,
    major_len,
    minor_len,
    dtype
) -> np.ndarray:
    """
    Computes mean and variance for a sparse matrix for the minor axis.
    Given arrays for a csr matrix, returns the means and variances for each
    column back.
    """
    non_zero = indices.shape[0]

    means = np.zeros(minor_len, dtype=dtype)
    variances = np.zeros_like(means, dtype=dtype)

    counts = np.zeros(minor_len, dtype=np.int64)

    for i in range(non_zero):
        col_ind = indices[i]
        means[col_ind] += data[i]

    for i in range(minor_len):
        means[i] /= major_len

    for i in range(non_zero):
        col_ind = indices[i]
        diff = data[i] - means[col_ind]
        variances[col_ind] += diff * diff
        counts[col_ind] += 1
    print(variances)
    for i in range(minor_len):
        variances[i] += (major_len - counts[i]) * means[i] ** 2
        variances[i] /= major_len

    return means, variances


@numba.njit(cache=True)
def sparse_mean_var_major_axis(
    data,
    indices,
    indptr,
    major_len,
    minor_len,
    dtype
) -> np.ndarray:
    """
    Computes mean and variance for a sparse array for the major axis.
    Given arrays for a csr matrix, returns the means and variances for each
    row back.
    """
    means = np.zeros(major_len, dtype=dtype)
    variances = np.zeros_like(means, dtype=dtype)

    for i in range(major_len):
        startptr = indptr[i]
        endptr = indptr[i + 1]
        counts = endptr - startptr

        for j in range(startptr, endptr):
            means[i] += data[j]
        means[i] /= minor_len

        for j in range(startptr, endptr):
            diff = data[j] - means[i]
            variances[i] += diff * diff

        variances[i] += (minor_len - counts) * means[i] ** 2
        variances[i] /= minor_len

    return means, variances

In [12]:
def materialize_as_ndarray(a):
    """Convert distributed arrays to ndarrays."""
    if type(a) in (list, tuple):
        if da is not None and any(isinstance(arr, da.Array) for arr in a):
            return da.compute(*a, sync=True)
        return tuple(np.asarray(arr) for arr in a)
    return np.asarray(a)

In [34]:
my_get_mean_var(b)

0
[ 5.88888889 11.11111111  8.66666667]


(array([1.66666667, 1.66666667, 3.66666667]),
 array([4.33333333, 8.33333333, 4.33333333]))

In [35]:
get_mean_var(b)

[ 5.88888889 11.11111111  8.66666667]


(array([1.66666667, 1.66666667, 3.66666667]),
 array([4.33333333, 8.33333333, 4.33333333]))

In [17]:
axis = 0
    (b_copy.mean(axis=0).toarray() - b.mean(axis=0).toarray() ** 2) * b.shape[axis] / (b.shape[axis] - 1)

array([4.33333333, 8.33333333, 4.33333333])

In [36]:
my_sparse_mean_variance_axis(b, axis=0)

0
[ 5.88888889 11.11111111  8.66666667]


(array([1.66666667, 1.66666667, 3.66666667]),
 array([2.88888889, 5.55555556, 2.88888889]))

In [37]:
sparse_mean_variance_axis(b, axis=0)

[ 5.88888889 11.11111111  8.66666667]


(array([1.66666667, 1.66666667, 3.66666667]),
 array([2.88888889, 5.55555556, 2.88888889]))

In [29]:
np.var(b.todense(), axis=0, ddof=1)

matrix([[4.33333333, 8.33333333, 4.33333333]])