# Array

Dask array provides a parallel, larger-than-memory, n-dimensional array using blocked algorithms. 

In short: **distributed NumPy**.


![](images/dask_array.png)

In [1]:
import numpy as np
import dask.array as da

# Tip: for a quick smoke test, swap in a tiny array instead:
#   x = da.random.normal(0, 1, size=(15, 10), chunks=(5, 10))

# 200,000 x 30 float64 array = 6 million elements (~48 MB),
# split row-wise into 20 chunks of 10,000 rows each.
x = da.random.normal(0, 1, size=(200000, 30), chunks=(10000, 30))

# Column-wise mean. This is lazy: it only builds the task graph, and nothing
# is computed until .compute() is called.
y = x.mean(axis=0)
y

Unnamed: 0,Array,Chunk
Bytes,240 B,240 B
Shape,"(30,)","(30,)"
Count,48 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 240 B 240 B Shape (30,) (30,) Count 48 Tasks 1 Chunks Type float64 numpy.ndarray",30  1,

Unnamed: 0,Array,Chunk
Bytes,240 B,240 B
Shape,"(30,)","(30,)"
Count,48 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [2]:
# Client is used below to connect this notebook to a Dask cluster (local or SGE).
from dask.distributed import Client
# Number of workers to request, whichever backend is selected below.
n_workers = 10

def scale_to_sge(n_workers):
    """Start a Dask cluster on our SGE grid and return a client connected to it.

    Each SGE job is configured with ``cores=1``, ``processes=1`` and 4GB of
    memory, and is submitted to the ``q_1day`` queue under the ``citerus``
    project.

    Parameters
    ----------
    n_workers : int
        Number of workers to request from the grid.

    Returns
    -------
    dask.distributed.Client
        Client attached to the newly created ``SGECluster``.
    """
    # Imported lazily so that dask_jobqueue is only required when the SGE
    # backend is actually used; the local backend does not need it.
    from dask_jobqueue import SGECluster

    queue = "q_1day"
    queue_resource_spec = "q_1day=TRUE"
    memory = "4GB"
    sge_log = "./logs"

    cluster = SGECluster(
        queue=queue,
        memory=memory,
        cores=1,
        processes=1,
        log_directory=sge_log,
        # NOTE(review): worker scratch space shares the log directory -- confirm this is intended.
        local_directory=sge_log,
        resource_spec=queue_resource_spec,
        project="citerus",
    )
    # scale() is the documented public API; scale_up() is a legacy hook/alias.
    cluster.scale(n_workers)
    # The workers are SGE jobs (not local threads); the client only connects to them.
    return Client(cluster)


#### SWITCH THIS FLAG TO RUN LOCALLY OR ON OUR SGE GRID ###
USE_SGE = True  # True: workers run as jobs on the SGE grid; False: workers run on this machine

if USE_SGE:
    # SGE client
    client = scale_to_sge(n_workers)
else:
    # Local client
    client = Client(n_workers=n_workers)



  from distributed.utils import tmpfile


In [3]:
%%time
# Trigger execution of the lazy column means through the distributed client;
# the result comes back as a NumPy array (see output below).
y.compute(scheduler=client)

CPU times: user 886 ms, sys: 137 ms, total: 1.02 s
Wall time: 31.9 s


array([ 2.33540356e-04,  4.96083933e-03,  8.06376144e-04,  5.14930159e-03,
       -8.89413544e-04,  5.56987289e-03, -5.14368837e-03,  1.55397391e-04,
       -6.08657122e-05,  1.51943612e-03,  5.11754464e-03,  3.28856620e-04,
        2.36708089e-03,  6.46353900e-04, -6.78811541e-04,  7.84611776e-04,
       -4.19538118e-03,  2.05811237e-03,  1.67990270e-03,  3.00970616e-03,
       -1.56402536e-05, -2.30860267e-03,  8.39258455e-04, -9.99563672e-04,
        2.13249776e-03, -2.20975935e-03,  2.24765075e-03,  1.35787434e-03,
       -3.00034155e-03, -3.44463608e-03])

# Several linear algebra functions are already implemented in parallel

For instance, the example below computes a compressed (approximate) SVD. Reference:

N. Halko, P. G. Martinsson, and J. A. Tropp. Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions. SIAM Rev., Survey and Review section, Vol. 53, num. 2, pp. 217-288, June 2011


In [4]:
%%time
# Compressed SVD of x keeping k=5 components; this call only builds lazy
# dask arrays for u, s and v.
u, s, v = da.linalg.svd_compressed(x, k=5)
# Only u is materialized here; s and v stay lazy until computed.
u.compute(scheduler=client)

CPU times: user 764 ms, sys: 74.6 ms, total: 838 ms
Wall time: 10.1 s


array([[ 1.04177697e-03, -2.40558964e-03,  1.06736682e-03,
         1.04458946e-03, -1.29048870e-03],
       [-1.01316822e-03,  2.01564861e-03, -1.37524736e-03,
         2.11226029e-03, -8.50848466e-04],
       [ 5.51829730e-04,  1.74482021e-03, -3.39125324e-05,
        -4.57938207e-04,  6.55159557e-04],
       ...,
       [-6.47950600e-04, -1.23414421e-03,  2.57047747e-03,
         5.55124988e-03,  2.74121770e-03],
       [ 2.08634666e-03, -5.49839055e-04,  1.48122535e-03,
        -1.96453590e-03, -1.92897719e-03],
       [-2.85356778e-03,  5.02331035e-04, -1.99078897e-03,
         7.49319567e-04,  2.74342588e-03]])

# Always shut down your client

In [5]:
# Release the cluster resources once the work is done.
# NOTE(review): the "Failed to reconnect to scheduler" error logged below appears
# to be the client noticing the scheduler is gone -- presumably harmless; confirm.
client.shutdown()

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


# Most of the NumPy API is there

Check out the [Dask array documentation](https://docs.dask.org/en/latest/array.html).