In [1]:
#HDF5 carray

In [1]:
%matplotlib inline
import math
import random

import numpy as np
import tables
import matplotlib.pyplot as plt


In [2]:
fname = '../raw/total-3L.h5'
store = tables.open_file(fname, 'r')

In [3]:
genotype = store.get_node('/3L/calldata/genotype')

In [4]:
genotype.read()

MemoryError: 

In [5]:
shape = genotype.shape
shape

(9643193, 765, 2)

In [6]:
genotype_np = genotype.read(stop=shape[1] * shape[2])
np_shape = genotype_np.shape
np_shape

(1530, 765, 2)

In [7]:
side = int(math.sqrt(np_shape[0] * np_shape[1] * np_shape[2] ))
genotype_np.shape = side, side
genotype_np.shape

(1530, 1530)

In [10]:
%timeit -n100000 -r3 genotype_np[random.randrange(side),:].sum()

100000 loops, best of 3: 7.64 µs per loop


In [11]:
%timeit -n100000 -r3 genotype_np[:, random.randrange(side)].sum()

100000 loops, best of 3: 13.6 µs per loop


In [28]:
%time sum_by_col(genotype_np)

CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 25.7 ms


In [None]:
def mult_native(mat):
    nrows, ncols = mat.shape
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    for row in range(nrows):
        for col in range(ncols):
            for i in range(nrows):
                mult[row, col] += mat[row, i] * mat[i, col]
    return mult

In [None]:
%time mult_native(genotype_np)

In [36]:
def mult_partial_vector(mat):
    nrows, ncols = mat.shape
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    for row in range(nrows):
        for col in range(ncols):
            mat[row, col] += np.sum(mat[row] * mat[:, col])
    return mult

In [37]:
%time mult_partial_vector(genotype_np)

CPU times: user 47 s, sys: 4 ms, total: 47 s
Wall time: 47 s


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
%time np.dot(genotype_np, genotype_np)

CPU times: user 4.87 s, sys: 0 ns, total: 4.87 s
Wall time: 4.87 s


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int8)

## Cython

In [66]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


### first knack

In [101]:
%%cython
cimport numpy as np
import numpy as np

def cython_mult_0(mat):
    nrows, ncols = mat.shape
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    for row in range(nrows):
        for col in range(ncols):
            for i in range(nrows):
                mult[row, col] += mat[row, i] * mat[i, col]
    return mult

In [102]:
%time cython_mult_0(genotype_np)

CPU times: user 52min 24s, sys: 11.4 s, total: 52min 35s
Wall time: 52min 20s


  if __name__ == '__main__':


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int8)

### second

In [99]:
%%cython
cimport numpy as np
import numpy as np

def cython_mult_1(mat):
    cdef int nrows = mat.shape[0]
    cdef int ncols = mat.shape[1]
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    for row in range(nrows):
        for col in range(ncols):
            for i in range(nrows):
                value = mat[row, i] * mat[i, col]
                mult[row, col] += value
    return mult

In [100]:
%time cython_mult_1(genotype_np)

CPU times: user 56min 38s, sys: 12 s, total: 56min 50s
Wall time: 56min 32s


  if __name__ == '__main__':


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int8)

### third

In [97]:
%%cython
cimport numpy as np
import numpy as np

def cython_mult_2(np.ndarray[np.int8_t, ndim=2] mat):
    cdef int nrows = mat.shape[0]
    cdef int ncols = mat.shape[1]
    cdef np.ndarray[np.int8_t, ndim=2] mult
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    cdef np.int8_t value
    for row in range(nrows):
        for col in range(ncols):
            for i in range(nrows):
                value = mat[row, i] * mat[i, col]
                mult[row, col] += value
    return mult

In [98]:
%time cython_mult_2(genotype_np)

CPU times: user 12.3 s, sys: 72 ms, total: 12.4 s
Wall time: 12.3 s


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int8)

### last

In [92]:
%%cython
cimport cython
cimport numpy as np
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False) #Annotate later
def cython_mult(np.ndarray[np.int8_t, ndim=2] mat):
    cdef int nrows = mat.shape[0]
    cdef int ncols = mat.shape[1]
    cdef np.ndarray[np.int8_t, ndim=2] mult
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    cdef np.int8_t value
    for row in range(nrows):
        for col in range(ncols):
            for i in range(nrows):
                value = mat[row, i] * mat[i, col]
                mult[row, col] += value
    return mult

In [94]:
%time cython_mult(genotype_np)

int8
CPU times: user 11.4 s, sys: 32 ms, total: 11.4 s
Wall time: 11.4 s


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int8)

## Numba

In [None]:
import numba

In [None]:
@numba.jit
def numba_mult(mat):
    nrows, ncols = mat.shape
    mult = np.zeros((nrows, ncols), dtype=np.int8)
    for row in range(nrows):
        for col in range(ncols):
            for i in range(nrows):
                mult[row, col] += mat[row, i] * mat[i, col]
    return mult