# <b>Python for Data Analysis</b>
# 14. Appendix A. Advanced NumPy

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)
from pandas import Series, DataFrame
%matplotlib inline

In [2]:
rng = np.random.default_rng(seed = 12345)

## A.1 ndarray Object Internals

In [3]:
np.ones((10, 5)).strides

(40, 8)

In [4]:
np.ones((10, 5)).strides

(40, 8)

In [5]:
np.ones((3, 4, 5), dtype = np.float64).strides

(160, 40, 8)

### NumPy Data Type Hierarchy

In [6]:
ints = np.ones(10, dtype = np.uint16)
floats = np.ones(10, dtype = np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

In [7]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [8]:
np.issubdtype(ints.dtype, np.number)

True

## A.2 Advanced Array Manipulation

### Reshaping Arrays

In [9]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [10]:
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [11]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [12]:
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [13]:
arrTest = np.arange(324)
arrTest.reshape((9, -1))

array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
         49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
         62,  63,  64,  65,  66,  67,  68,  69,  70,  71],
       [ 72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107],
       [108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 142, 143],
       [144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
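        170, 171, 172, 173, 174, 175, 176, 177, 178, 179],
       [180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
        193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
        206, 207, 208, 209, 210, 211, 212, 213, 214, 215],
       [216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,
        229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
        242, 243, 244, 245, 246, 247, 248, 249, 250, 251],
       [252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264,
        265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277,
        278, 279, 280, 281, 282, 283, 284, 285, 286, 287],
       [288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300,
        301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313,
        314, 315, 316, 317, 318, 319, 320, 321, 322, 323]])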

In [14]:
other_arr = np.ones((3, 5))
other_arr.shape

(3, 5)

In [15]:
other_arr

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [16]:
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [17]:
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [18]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [19]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus FORTRAN Order

In [20]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [21]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [22]:
arr.ravel("F")

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [23]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

In [24]:
np.concatenate([arr1, arr2], axis = 0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [25]:
np.concatenate([arr1, arr2], axis = 1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [26]:
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [27]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [28]:
arr = rng.standard_normal((5, 2))
arr

array([[-1.4238,  1.2637],
       [-0.8707, -0.2592],
       [-0.0753, -0.7409],
       [-1.3678,  0.6489],
       [ 0.3611, -1.9529]])

In [29]:
first, second, third = np.split(arr, [1, 3])

In [30]:
first

array([[-1.4238,  1.2637]])

In [31]:
second

array([[-0.8707, -0.2592],
       [-0.0753, -0.7409]])

In [32]:
third

array([[-1.3678,  0.6489],
       [ 0.3611, -1.9529]])

#### Stacking helpers: r_ and c_

In [33]:
arr = np.arange(6)
arr

array([0, 1, 2, 3, 4, 5])

In [34]:
arr1 = arr.reshape((3, 2))
arr2 = rng.standard_normal((3, 2))

In [35]:
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [36]:
arr2

array([[ 2.3474,  0.9685],
       [-0.7594,  0.9022],
       [-0.467 , -0.0607]])

In [37]:
np.r_[arr1, arr2]

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [ 2.3474,  0.9685],
       [-0.7594,  0.9022],
       [-0.467 , -0.0607]])

In [38]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 2.3474,  0.9685,  3.    ],
       [-0.7594,  0.9022,  4.    ],
       [-0.467 , -0.0607,  5.    ]])

In [39]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: <code>tile</code> and <code>repeat</code>

In [40]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [41]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [42]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [43]:
arr = rng.standard_normal((2, 2))
arr

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [44]:
arr.repeat(2, axis = 0)

array([[ 0.7888, -1.2567],
       [ 0.7888, -1.2567],
       [ 0.5759,  1.399 ],
       [ 0.5759,  1.399 ]])

In [45]:
arr.repeat([2, 3], axis = 0)

array([[ 0.7888, -1.2567],
       [ 0.7888, -1.2567],
       [ 0.5759,  1.399 ],
       [ 0.5759,  1.399 ],
       [ 0.5759,  1.399 ]])

In [46]:
arr.repeat([2, 3], axis = 1)

array([[ 0.7888,  0.7888, -1.2567, -1.2567, -1.2567],
       [ 0.5759,  0.5759,  1.399 ,  1.399 ,  1.399 ]])

In [47]:
arr

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [48]:
np.tile(arr, 2)

array([[ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ]])

In [49]:
arr

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [50]:
np.tile(arr, (2, 1))

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ],
       [ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [51]:
np.tile(arr, (3, 2))

array([[ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ],
       [ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ],
       [ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ]])

### Fancy Indexing Equivalents: <code>take</code> and <code>put</code>

In [52]:
arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [53]:
inds = [7, 1, 2, 6]
inds

[7, 1, 2, 6]

In [54]:
arr[inds]

array([700, 100, 200, 600])

In [55]:
arr.take(inds)

array([700, 100, 200, 600])

In [56]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [57]:
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [58]:
inds = [2, 0, 2, 1]

In [59]:
arr = rng.standard_normal((2, 4))
arr

array([[ 1.3223, -0.2997,  0.9029, -1.6216],
       [-0.1582,  0.4495, -1.3436, -0.0817]])

In [60]:
arr.take(inds, axis = 1)

array([[ 0.9029,  1.3223,  0.9029, -0.2997],
       [-1.3436, -0.1582, -1.3436,  0.4495]])

## A.3 Broadcasting

In [61]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [62]:
arr * 4

array([ 0,  4,  8, 12, 16])

In [63]:
arr = rng.standard_normal((4, 3))
arr

array([[ 1.7247,  2.6182,  0.7774],
       [ 0.8286, -0.959 , -1.2094],
       [-1.4123,  0.5415,  0.7519],
       [-0.6588, -1.2287,  0.2576]])

In [64]:
arr.mean(0)

array([0.1206, 0.243 , 0.1444])

In [65]:
demeaned = arr - arr.mean(0)
demeaned

array([[ 1.6042,  2.3751,  0.633 ],
       [ 0.7081, -1.202 , -1.3538],
       [-1.5329,  0.2985,  0.6076],
       [-0.7793, -1.4717,  0.1132]])

In [66]:
arr

array([[ 1.7247,  2.6182,  0.7774],
       [ 0.8286, -0.959 , -1.2094],
       [-1.4123,  0.5415,  0.7519],
       [-0.6588, -1.2287,  0.2576]])

In [67]:
row_means = arr.mean(1)
row_means

array([ 1.7068, -0.4466, -0.0396, -0.5433])

In [68]:
row_means.shape

(4,)

In [69]:
row_means.reshape((4, 1))

array([[ 1.7068],
       [-0.4466],
       [-0.0396],
       [-0.5433]])

In [70]:
demeaned = arr - row_means.reshape((4, 1))

In [71]:
demeaned.mean(1)

array([-0.,  0.,  0.,  0.])

### Broadcasting over Other Axes

In [72]:
arr

array([[ 1.7247,  2.6182,  0.7774],
       [ 0.8286, -0.959 , -1.2094],
       [-1.4123,  0.5415,  0.7519],
       [-0.6588, -1.2287,  0.2576]])

In [73]:
arr.mean(1)

array([ 1.7068, -0.4466, -0.0396, -0.5433])

In [74]:
# Will return an error:
# arr - arr.mean(1)

In [75]:
arr - arr.mean(1).reshape((4, 1))

array([[ 0.018 ,  0.9114, -0.9294],
       [ 1.2752, -0.5124, -0.7628],
       [-1.3727,  0.5811,  0.7915],
       [-0.1155, -0.6854,  0.8009]])

In [76]:
arr = np.zeros((4, 4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [77]:
arr_3d = arr[:, np.newaxis, :]

In [78]:
arr_3d.shape

(4, 1, 4)

In [79]:
arr_1d = rng.standard_normal(3)
arr_1d

array([ 0.3129, -0.1308,  1.27  ])

In [80]:
arr_1d[:, np.newaxis]

array([[ 0.3129],
       [-0.1308],
       [ 1.27  ]])

In [81]:
arr_1d.shape

(3,)

In [82]:
arr_1d[np.newaxis, :]

array([[ 0.3129, -0.1308,  1.27  ]])

In [83]:
arr = rng.standard_normal((3, 4, 5))

In [84]:
arr

array([[[-0.093 , -0.0662, -1.1082,  0.136 ,  1.3471],
        [ 0.0611,  0.0709,  0.4337,  0.2775,  0.5303],
        [ 0.5367,  0.6184, -0.795 ,  0.3   , -1.6027],
        [ 0.2668, -1.2616, -0.0713,  0.474 , -0.4149]],

       [[ 0.0977, -1.6404, -0.8573,  0.6883, -1.1545],
        [ 0.6505, -1.3884, -0.9074, -1.0954,  0.0071],
        [ 0.5344, -1.0658, -0.1815,  1.622 , -0.3174],
        [-0.8158,  0.3866, -0.2236, -0.7017, -1.7957]],

       [[ 0.8183, -0.571 ,  0.0008, -1.0636,  1.3017],
        [ 0.7479,  0.9809, -0.1104,  0.4679,  0.8906],
        [ 1.023 ,  0.3124, -0.0619, -0.3595, -0.7486],
        [-0.9655,  0.36  , -0.2446, -1.9959, -0.1552]]])

In [85]:
depth_means = arr.mean(2)
depth_means

array([[ 0.0431,  0.2747, -0.1885, -0.2014],
       [-0.5732, -0.5467,  0.1183, -0.6301],
       [ 0.0972,  0.5954,  0.0331, -0.6002]])

In [86]:
depth_means.shape

(3, 4)

In [87]:
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[ 0., -0.,  0., -0.],
       [ 0., -0., -0., -0.],
       [ 0.,  0.,  0.,  0.]])

In [88]:
def demean_axis(arr, axis = 0):
    """Subtract from `arr` its mean taken along `axis`.

    Parameters
    ----------
    arr : ndarray
        Input array; it is not modified.
    axis : int, default 0
        Axis along which the mean is computed and then removed.

    Returns
    -------
    ndarray
        Result of ``arr - mean`` with the mean broadcast back along `axis`,
        so the result has the same shape as `arr`.
    """
    means = arr.mean(axis)

    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # A multidimensional index must be a tuple: a *list* of slices/None is
    # interpreted as a fancy index and raises IndexError on current NumPy.
    return arr - means[tuple(indexer)]

### Setting Array Values by Broadcasting

In [89]:
arr = np.zeros((4, 3))
arr

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [90]:
arr[:] = 5

In [91]:
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [92]:
col = np.array([1.28, -0.42, 0.44, 1.6])
col

array([ 1.28, -0.42,  0.44,  1.6 ])

In [93]:
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

## A.4 Advanced ufunc Usage

### ufunc Instance Methods

In [94]:
arr = np.arange(10)

In [95]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [96]:
np.add.reduce(arr)

45

In [97]:
arr.sum()

45

In [98]:
my_rng = np.random.default_rng(12346)

In [99]:
arr = my_rng.standard_normal((5, 5))
arr

array([[-0.9039,  0.1571,  0.8976, -0.7622, -0.1763],
       [ 0.053 , -1.6284, -0.1775,  1.9636,  1.7813],
       [-0.8797, -1.6985, -1.8189,  0.119 , -0.4441],
       [ 0.7691, -0.0343,  0.3925,  0.7589, -0.0705],
       [ 1.0498,  1.0297, -0.4201,  0.7863,  0.9612]])

In [100]:
arr[::2].sort(1)
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True,  True, False],
       [ True,  True,  True,  True],
       [False,  True,  True, False],
       [ True,  True,  True,  True]])

In [101]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis = 1)

array([ True, False,  True, False,  True])

In [102]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [103]:
np.add.accumulate(arr, axis = 1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

In [104]:
arr.cumsum(axis = 1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

In [105]:
arr = np.arange(3).repeat([1, 2, 2])
arr

array([0, 1, 1, 2, 2])

In [106]:
np.arange(5)

array([0, 1, 2, 3, 4])

In [107]:
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [108]:
x, y = rng.standard_normal((3, 4)), rng.standard_normal(5)

In [109]:
x

array([[ 1.0638, -0.2752, -1.8533, -0.1243],
       [ 0.785 ,  0.202 , -0.4281,  1.8483],
       [ 1.9   , -0.0984,  0.8134,  0.3925]])

In [110]:
y

array([ 0.7814,  1.4533,  0.8202,  0.0877, -0.6535])

In [111]:
result = np.subtract.outer(x, y)
result

array([[[ 0.2824, -0.3894,  0.2436,  0.9761,  1.7173],
        [-1.0566, -1.7284, -1.0954, -0.3629,  0.3783],
        [-2.6348, -3.3066, -2.6735, -1.941 , -1.1998],
        [-0.9058, -1.5776, -0.9445, -0.212 ,  0.5292]],

       [[ 0.0035, -0.6683, -0.0352,  0.6973,  1.4385],
        [-0.5794, -1.2513, -0.6182,  0.1143,  0.8555],
        [-1.2095, -1.8813, -1.2483, -0.5158,  0.2254],
        [ 1.0668,  0.395 ,  1.0281,  1.7606,  2.5018]],

       [[ 1.1185,  0.4467,  1.0798,  1.8122,  2.5535],
        [-0.8799, -1.5517, -0.9186, -0.1861,  0.5551],
        [ 0.032 , -0.6398, -0.0067,  0.7257,  1.467 ],
        [-0.3889, -1.0608, -0.4277,  0.3048,  1.046 ]]])

In [112]:
result.shape

(3, 4, 5)

In [113]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [114]:
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17], dtype=int32)

In [115]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

In [116]:
np.add.reduceat(arr, [0, 2, 4], axis = 1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]], dtype=int32)

### Writing New ufuncs in Python

In [117]:
def add_elements(x, y):
    """Return ``x + y`` for a single pair of operands."""
    total = x + y
    return total

In [118]:
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [119]:
add_them = np.vectorize(add_elements, otypes = [np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [120]:
arr = rng.standard_normal(10_000)

In [121]:
%timeit add_them(arr, arr)

4.66 ms ± 790 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [122]:
%timeit np.add(arr, arr)

7.19 µs ± 979 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## A.5 Structured and Record Arrays

In [123]:
dtype = [('x', np.float64), ('y', np.int32)]

In [124]:
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype = dtype)
sarr

array([(1.5   ,  6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])

In [125]:
sarr[0]

(1.5, 6)

In [126]:
sarr[0]['y']

6

In [127]:
sarr['x']

array([1.5   , 3.1416])

### Nested Data Types and Multidimensional Fields

In [128]:
dtype = [('x', np.int64, 3), ('y', np.int32)]

In [129]:
arr = np.zeros(4, dtype = dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [130]:
arr[0]['x']

array([0, 0, 0], dtype=int64)

In [131]:
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]], dtype=int64)

In [132]:
np.zeros(4)

array([0., 0., 0., 0.])

In [133]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]

In [134]:
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype = dtype)
data

array([((1., 2.), 5), ((3., 4.), 6)],
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

In [135]:
data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

In [136]:
data['y']

array([5, 6])

In [137]:
data['x']['a']

array([1., 3.])

## A.6 More About Sorting

In [138]:
arr = rng.standard_normal(6)
arr.sort()
arr

array([-1.1553, -0.9319, -0.5218, -0.4745, -0.1649,  0.03  ])

In [139]:
arr = rng.standard_normal((3, 5))
arr

array([[-1.1956,  0.4691, -0.3598,  1.0359,  0.2267],
       [-0.7448, -0.5931, -1.055 , -0.0683,  0.458 ],
       [-0.07  ,  0.1462, -0.9944,  1.1436,  0.5026]])

In [140]:
arr[:, 0].sort()
arr

array([[-1.1956,  0.4691, -0.3598,  1.0359,  0.2267],
       [-0.7448, -0.5931, -1.055 , -0.0683,  0.458 ],
       [-0.07  ,  0.1462, -0.9944,  1.1436,  0.5026]])

In [142]:
arr = rng.standard_normal(5)
arr

array([ 0.8981, -1.1704, -0.2686, -0.796 ,  1.4522])

In [143]:
np.sort(arr)

array([-1.1704, -0.796 , -0.2686,  0.8981,  1.4522])

In [144]:
arr

array([ 0.8981, -1.1704, -0.2686, -0.796 ,  1.4522])

In [145]:
arr = rng.standard_normal((3, 5))
arr

array([[-0.2535,  2.1183,  0.3634, -0.6245,  1.1279],
       [ 1.6164, -0.2287, -0.6201, -0.1143, -1.2067],
       [-1.0872, -2.1518, -0.6287, -1.3199,  0.083 ]])

In [146]:
arr.sort(axis = 1)
arr

array([[-0.6245, -0.2535,  0.3634,  1.1279,  2.1183],
       [-1.2067, -0.6201, -0.2287, -0.1143,  1.6164],
       [-2.1518, -1.3199, -1.0872, -0.6287,  0.083 ]])

In [147]:
arr[:, ::-1]

array([[ 2.1183,  1.1279,  0.3634, -0.2535, -0.6245],
       [ 1.6164, -0.1143, -0.2287, -0.6201, -1.2067],
       [ 0.083 , -0.6287, -1.0872, -1.3199, -2.1518]])

### Indirect Sorts: <code>argsort</code> and <code>lexsort</code>

In [148]:
values = np.array([5, 0, 1, 3, 2])
values

array([5, 0, 1, 3, 2])

In [149]:
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0], dtype=int64)

In [150]:
values[indexer]

array([0, 1, 2, 3, 5])

In [151]:
arr = rng.standard_normal((3, 5))
arr

array([[ 0.2017,  0.9173,  0.1764, -0.2934,  1.1621],
       [-0.7503, -2.1268, -1.391 , -0.4922,  0.4505],
       [ 0.8926, -1.0479,  0.9553,  0.2936,  0.5379]])

In [152]:
arr[0] = values
arr

array([[ 5.    ,  0.    ,  1.    ,  3.    ,  2.    ],
       [-0.7503, -2.1268, -1.391 , -0.4922,  0.4505],
       [ 0.8926, -1.0479,  0.9553,  0.2936,  0.5379]])

In [153]:
arr[:, arr[0].argsort()]

array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [-2.1268, -1.391 ,  0.4505, -0.4922, -0.7503],
       [-1.0479,  0.9553,  0.5379,  0.2936,  0.8926]])

In [154]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))

In [157]:
# Pair each last name with its first name; a repeated last name keeps the
# first name that appears latest in the arrays.
dict_test = dict(zip(last_name, first_name))
dict_test

{'Jones': 'Bill', 'Arnold': 'Steve', 'Walters': 'Barbara'}

In [158]:
sorter

array([1, 2, 3, 0, 4], dtype=int64)

In [159]:
list(zip(last_name[sorter], first_name[sorter]))

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

### Alternative Sort Algorithms

In [161]:
values = np.array(["2:first", "2:second", "1:first", "1:second",
                   "1:third"])

In [162]:
key = np.array([2, 2, 1, 1, 1])

In [163]:
indexer = key.argsort(kind = 'mergesort')
indexer

array([2, 3, 4, 0, 1], dtype=int64)

In [164]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

### Partially Sorting Arrays

In [165]:
rng = np.random.default_rng(12345)
arr = rng.standard_normal(20)
arr

array([-1.4238,  1.2637, -0.8707, -0.2592, -0.0753, -0.7409, -1.3678,
        0.6489,  0.3611, -1.9529,  2.3474,  0.9685, -0.7594,  0.9022,
       -0.467 , -0.0607,  0.7888, -1.2567,  0.5759,  1.399 ])

In [166]:
np.partition(arr, 3)

array([-1.9529, -1.4238, -1.3678, -1.2567, -0.8707, -0.7594, -0.7409,
       -0.0607,  0.3611, -0.0753, -0.2592, -0.467 ,  0.5759,  0.9022,
        0.9685,  0.6489,  0.7888,  1.2637,  1.399 ,  2.3474])

In [167]:
np.sort(arr)

array([-1.9529, -1.4238, -1.3678, -1.2567, -0.8707, -0.7594, -0.7409,
       -0.467 , -0.2592, -0.0753, -0.0607,  0.3611,  0.5759,  0.6489,
        0.7888,  0.9022,  0.9685,  1.2637,  1.399 ,  2.3474])

In [168]:
indices = np.argpartition(arr, 3)
indices

array([ 9,  0,  6, 17,  2, 12,  5, 15,  8,  4,  3, 14, 18, 13, 11,  7, 16,
        1, 19, 10], dtype=int64)

In [169]:
arr.take(indices)

array([-1.9529, -1.4238, -1.3678, -1.2567, -0.8707, -0.7594, -0.7409,
       -0.0607,  0.3611, -0.0753, -0.2592, -0.467 ,  0.5759,  0.9022,
        0.9685,  0.6489,  0.7888,  1.2637,  1.399 ,  2.3474])

In [171]:
test1 = np.array([3, 5, 2, 4, 1, 6, 2])
test1

array([3, 5, 2, 4, 1, 6, 2])

In [172]:
np.sort(test1)

array([1, 2, 2, 3, 4, 5, 6])

In [176]:
np.partition(test1, 2)

array([1, 2, 2, 4, 3, 6, 5])

In [178]:
%timeit np.sort(test1)

2.19 µs ± 299 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [179]:
%timeit np.partition(test1, 3)

2.86 µs ± 387 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [180]:
%timeit np.partition(arr, 3)

2.99 µs ± 282 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [181]:
%timeit np.sort(arr)

2.31 µs ± 377 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [183]:
rng = np.random.default_rng(12345)
arr_test = rng.standard_normal(10_000_000)

In [184]:
%timeit np.partition(arr_test, 3)

79.8 ms ± 3.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [185]:
%timeit np.sort(arr_test)

1.11 s ± 71.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [186]:
np.sort(test1)[:3]

array([1, 2, 2])

In [187]:
np.sort(arr_test)[:3]

array([-5.5952, -5.3395, -5.0341])

In [189]:
np.partition(arr_test, 3)[:3]

array([-5.5952, -5.3395, -5.0341])

### <code>numpy.searchsorted</code>: Finding Elements in a Sorted Array

In [190]:
arr = np.array([0, 1, 7, 12, 15])

In [191]:
arr.searchsorted(9)

3

In [192]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5], dtype=int64)

In [193]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])

In [194]:
arr.searchsorted([0, 1])

array([0, 3], dtype=int64)

In [195]:
arr.searchsorted([0, 1], side = "right")

array([3, 7], dtype=int64)

In [196]:
data = np.floor(rng.uniform(0, 10_000, size = 50))
bins = np.array([0, 100, 1_000, 5_000, 10_000])
data

array([3318., 6314., 2080., 8312., 3140., 7257., 5351., 8031., 3489.,
       2781., 8983.,  357., 6559., 3169., 2242., 1301., 1697., 9621.,
       8242., 1432., 9708., 4437., 2010.,   48., 1881., 4003., 5212.,
       6598., 1492., 1221., 6802., 9585., 3447., 5210., 3722.,  561.,
       2819., 5352., 6480., 1443., 1719., 3721., 9834., 4318., 7376.,
       5745., 4201.,  971., 7434., 6236.])

In [197]:
labels = bins.searchsorted(data)
labels

array([3, 4, 3, 4, 3, 4, 4, 4, 3, 3, 4, 2, 4, 3, 3, 3, 3, 4, 4, 3, 4, 3,
       3, 1, 3, 3, 4, 4, 3, 3, 4, 4, 3, 4, 3, 2, 3, 4, 4, 3, 3, 3, 4, 3,
       4, 4, 3, 2, 4, 4], dtype=int64)

In [198]:
pd.Series(data).groupby(labels).mean()

1      48.000000
2     629.666667
3    2711.791667
4    7283.727273
dtype: float64

## A.7 Writing Fast NumPy Functions with Numba

In [199]:
def mean_distance(x, y):
    """Return the mean of the signed differences ``x[i] - y[i]``.

    Iterates over the indices of `x` in an explicit Python loop, so `y` must
    be indexable at every position of `x`. With an empty `x` the final
    division is by zero.
    """
    n_items = len(x)
    running_total = 0.0
    for idx in range(n_items):
        running_total += x[idx] - y[idx]
    # Each index contributes exactly once, so the divisor is len(x).
    return running_total / n_items

In [200]:
x = rng.standard_normal(10_000_000)
y = rng.standard_normal(10_000_000)

In [201]:
%timeit mean_distance(x, y)

5.6 s ± 251 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [202]:
%timeit (x - y).mean()

58.6 ms ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [203]:
import numba as nb

In [204]:
numba_mean_distance = nb.jit(mean_distance)

In [205]:
@nb.jit
def numba_mean_distance(x, y):
    """Mean of the signed differences ``x[i] - y[i]``, wrapped with ``nb.jit``.

    Uses the same explicit index loop over `x` as the pure-Python
    `mean_distance`.
    """
    n_items = len(x)
    acc = 0.0
    for pos in range(n_items):
        acc += x[pos] - y[pos]
    # Each index contributes exactly once, so the divisor is len(x).
    return acc / n_items

In [206]:
%timeit numba_mean_distance(x, y)

15.1 ms ± 3.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [207]:
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    """Mean of the elementwise difference ``x - y``.

    The explicit signature declares two 1-D float64 arrays as inputs and a
    float64 scalar as the result.
    """
    difference = x - y
    return difference.mean()

In [208]:
%timeit mean_distance(x, y)

52.3 ms ± 3.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Creating Custom numpy.ufunc Objects with Numba

In [217]:
from numba import vectorize

@vectorize
def nb_add(x, y):
    """Scalar kernel returning ``x + y``; wrapped by numba's ``vectorize``."""
    total = x + y
    return total

In [218]:
x = np.arange(10)

In [219]:
nb_add(x, x)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18], dtype=int64)

In [228]:
# nb_add.accumulate(x, 0)

## A.8 Advanced Array Input and Output

### Memory-Mapped Files

In [230]:
# Create a new memory-mapped file of float64 values (mode 'w+' creates or
# overwrites it). The file name must be 'mymmap' so that the later cells,
# which reopen and remove 'mymmap', refer to the same file.
mmap = np.memmap('mymmap', dtype = 'float64', mode = 'w+',
                shape = (10_000, 10_000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [231]:
section = mmap[:5]

In [232]:
section[:] = rng.standard_normal((5, 10_000))
mmap.flush()
mmap

memmap([[-0.7846, -0.9227,  1.0363, ..., -1.9276,  0.8512,  1.103 ],
        [ 0.5787, -1.6586,  0.7683, ..., -0.3577,  0.0716,  0.1095],
        [ 0.0228, -0.9197,  0.4387, ..., -1.0965, -0.5849,  0.9752],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [233]:
del mmap

In [237]:
# mmap = np.memmap('mymmap', dtype = 'float64', shape = (10_000, 10_000))
# mmap

In [238]:
%xdel mmap

NameError: name 'mmap' is not defined


In [239]:
# Remove the memory-mapped file in a cross-platform way: the `rm` shell
# command does not exist in the default Windows shell.
# NOTE(review): on Windows the removal may fail while a view of the memmap
# (e.g. `section`) is still alive -- confirm, and delete such views first.
from pathlib import Path

Path('mymmap').unlink(missing_ok = True)


## A.9 Performance Tips

### The Importance of Contiguous Memory

In [240]:
arr_c = np.ones((100, 10_000), order = "C")
arr_f = np.ones((100, 10_000), order = "F")
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [241]:
arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [242]:
%timeit arr_c.sum(1)

1.81 ms ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [243]:
%timeit arr_f.sum(1)

738 µs ± 52.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [244]:
arr_f.copy("C").flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [245]:
arr_c[:50].flags.contiguous

True

In [246]:
arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False