# <b>Python for Data Analysis</b>
# 14. Appendix A. Advanced NumPy

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)
from pandas import Series, DataFrame
%matplotlib inline

In [2]:
rng = np.random.default_rng(seed = 12345)

## A.1 ndarray Object Internals

In [3]:
np.ones((10, 5)).strides

(40, 8)

In [4]:
np.ones((10, 5)).strides

(40, 8)

In [5]:
np.ones((3, 4, 5), dtype = np.float64).strides

(160, 40, 8)

### NumPy Data Type Hierarchy

In [6]:
ints = np.ones(10, dtype = np.uint16)
floats = np.ones(10, dtype = np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

In [7]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [8]:
np.issubdtype(ints.dtype, np.number)

True

## A.2 Advanced Array Manipulation

### Reshaping Arrays

In [9]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [10]:
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [11]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [12]:
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [13]:
arrTest = np.arange(324)
arrTest.reshape((9, -1))

array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
         49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
         62,  63,  64,  65,  66,  67,  68,  69,  70,  71],
       [ 72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107],
       [108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 142, 143],
       [144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
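        170, 171, 172, 173, 174, 175, 176, 177, 178, 179],
       [180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
        193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
        206, 207, 208, 209, 210, 211, 212, 213, 214, 215],
       [216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,
        229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
        242, 243, 244, 245, 246, 247, 248, 249, 250, 251],
       [252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264,
        265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277,
        278, 279, 280, 281, 282, 283, 284, 285, 286, 287],
       [288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300,
        301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313,
        314, 315, 316, 317, 318, 319, 320, 321, 322, 323]])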

In [14]:
other_arr = np.ones((3, 5))
other_arr.shape

(3, 5)

In [15]:
other_arr

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [16]:
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [17]:
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [18]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [19]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus FORTRAN Order

In [20]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [21]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [22]:
arr.ravel("F")

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [23]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

In [24]:
np.concatenate([arr1, arr2], axis = 0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [25]:
np.concatenate([arr1, arr2], axis = 1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [26]:
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [27]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [28]:
arr = rng.standard_normal((5, 2))
arr

array([[-1.4238,  1.2637],
       [-0.8707, -0.2592],
       [-0.0753, -0.7409],
       [-1.3678,  0.6489],
       [ 0.3611, -1.9529]])

In [32]:
first, second, third = np.split(arr, [1, 3])

In [33]:
first

array([[-1.4238,  1.2637]])

In [34]:
second

array([[-0.8707, -0.2592],
       [-0.0753, -0.7409]])

In [35]:
third

array([[-1.3678,  0.6489],
       [ 0.3611, -1.9529]])

#### Stacking helpers: r_ and c_

In [37]:
arr = np.arange(6)
arr

array([0, 1, 2, 3, 4, 5])

In [39]:
arr1 = arr.reshape((3, 2))
arr2 = rng.standard_normal((3, 2))

In [41]:
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [42]:
arr2

array([[ 2.3474,  0.9685],
       [-0.7594,  0.9022],
       [-0.467 , -0.0607]])

In [40]:
np.r_[arr1, arr2]

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [ 2.3474,  0.9685],
       [-0.7594,  0.9022],
       [-0.467 , -0.0607]])

In [43]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 2.3474,  0.9685,  3.    ],
       [-0.7594,  0.9022,  4.    ],
       [-0.467 , -0.0607,  5.    ]])

In [44]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: <code>tile</code> and <code>repeat</code>

In [45]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [46]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [50]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [51]:
arr = rng.standard_normal((2, 2))
arr

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [52]:
arr.repeat(2, axis = 0)

array([[ 0.7888, -1.2567],
       [ 0.7888, -1.2567],
       [ 0.5759,  1.399 ],
       [ 0.5759,  1.399 ]])

In [53]:
arr.repeat([2, 3], axis = 0)

array([[ 0.7888, -1.2567],
       [ 0.7888, -1.2567],
       [ 0.5759,  1.399 ],
       [ 0.5759,  1.399 ],
       [ 0.5759,  1.399 ]])

In [54]:
arr.repeat([2, 3], axis = 1)

array([[ 0.7888,  0.7888, -1.2567, -1.2567, -1.2567],
       [ 0.5759,  0.5759,  1.399 ,  1.399 ,  1.399 ]])

In [55]:
arr

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [56]:
np.tile(arr, 2)

array([[ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ]])

In [57]:
arr

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [58]:
np.tile(arr, (2, 1))

array([[ 0.7888, -1.2567],
       [ 0.5759,  1.399 ],
       [ 0.7888, -1.2567],
       [ 0.5759,  1.399 ]])

In [59]:
np.tile(arr, (3, 2))

array([[ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ],
       [ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ],
       [ 0.7888, -1.2567,  0.7888, -1.2567],
       [ 0.5759,  1.399 ,  0.5759,  1.399 ]])

### Fancy Indexing Equivalents: <code>take</code> and <code>put</code>

In [60]:
arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [62]:
inds = [7, 1, 2, 6]
inds

[7, 1, 2, 6]

In [63]:
arr[inds]

array([700, 100, 200, 600])

In [64]:
arr.take(inds)

array([700, 100, 200, 600])

In [66]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [68]:
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [69]:
inds = [2, 0, 2, 1]

In [70]:
arr = rng.standard_normal((2, 4))
arr

array([[ 1.3223, -0.2997,  0.9029, -1.6216],
       [-0.1582,  0.4495, -1.3436, -0.0817]])

In [71]:
arr.take(inds, axis = 1)

array([[ 0.9029,  1.3223,  0.9029, -0.2997],
       [-1.3436, -0.1582, -1.3436,  0.4495]])

## A.3 Broadcasting

In [72]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [73]:
arr * 4

array([ 0,  4,  8, 12, 16])

In [74]:
arr = rng.standard_normal((4, 3))
arr

array([[ 1.7247,  2.6182,  0.7774],
       [ 0.8286, -0.959 , -1.2094],
       [-1.4123,  0.5415,  0.7519],
       [-0.6588, -1.2287,  0.2576]])

In [75]:
arr.mean(0)

array([0.1206, 0.243 , 0.1444])

In [77]:
demeaned = arr - arr.mean(0)
demeaned

array([[ 1.6042,  2.3751,  0.633 ],
       [ 0.7081, -1.202 , -1.3538],
       [-1.5329,  0.2985,  0.6076],
       [-0.7793, -1.4717,  0.1132]])

In [78]:
arr

array([[ 1.7247,  2.6182,  0.7774],
       [ 0.8286, -0.959 , -1.2094],
       [-1.4123,  0.5415,  0.7519],
       [-0.6588, -1.2287,  0.2576]])

In [80]:
row_means = arr.mean(1)
row_means

array([ 1.7068, -0.4466, -0.0396, -0.5433])

In [81]:
row_means.shape

(4,)

In [82]:
row_means.reshape((4, 1))

array([[ 1.7068],
       [-0.4466],
       [-0.0396],
       [-0.5433]])

In [83]:
demeaned = arr - row_means.reshape((4, 1))

In [84]:
demeaned.mean(1)

array([-0.,  0.,  0.,  0.])

### Broadcasting over Other Axes

In [85]:
arr

array([[ 1.7247,  2.6182,  0.7774],
       [ 0.8286, -0.959 , -1.2094],
       [-1.4123,  0.5415,  0.7519],
       [-0.6588, -1.2287,  0.2576]])

In [86]:
arr.mean(1)

array([ 1.7068, -0.4466, -0.0396, -0.5433])

In [90]:
# Raises ValueError: shapes (4, 3) and (4,) cannot be broadcast together
# arr - arr.mean(1)

In [92]:
arr - arr.mean(1).reshape((4, 1))

array([[ 0.018 ,  0.9114, -0.9294],
       [ 1.2752, -0.5124, -0.7628],
       [-1.3727,  0.5811,  0.7915],
       [-0.1155, -0.6854,  0.8009]])

In [94]:
arr = np.zeros((4, 4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [95]:
arr_3d = arr[:, np.newaxis, :]

In [96]:
arr_3d.shape

(4, 1, 4)

In [99]:
arr_1d = rng.standard_normal(3)
arr_1d

array([-0.093 , -0.0662, -1.1082])

In [100]:
arr_1d[:, np.newaxis]

array([[-0.093 ],
       [-0.0662],
       [-1.1082]])

In [101]:
arr_1d.shape

(3,)

In [102]:
arr_1d[np.newaxis, :]

array([[-0.093 , -0.0662, -1.1082]])

In [103]:
arr = rng.standard_normal((3, 4, 5))

In [105]:
arr

array([[[ 0.136 ,  1.3471,  0.0611,  0.0709,  0.4337],
        [ 0.2775,  0.5303,  0.5367,  0.6184, -0.795 ],
        [ 0.3   , -1.6027,  0.2668, -1.2616, -0.0713],
        [ 0.474 , -0.4149,  0.0977, -1.6404, -0.8573]],

       [[ 0.6883, -1.1545,  0.6505, -1.3884, -0.9074],
        [-1.0954,  0.0071,  0.5344, -1.0658, -0.1815],
        [ 1.622 , -0.3174, -0.8158,  0.3866, -0.2236],
        [-0.7017, -1.7957,  0.8183, -0.571 ,  0.0008]],

       [[-1.0636,  1.3017,  0.7479,  0.9809, -0.1104],
        [ 0.4679,  0.8906,  1.023 ,  0.3124, -0.0619],
        [-0.3595, -0.7486, -0.9655,  0.36  , -0.2446],
        [-1.9959, -0.1552,  1.0638, -0.2752, -1.8533]]])

In [104]:
depth_means = arr.mean(2)
depth_means

array([[ 0.4097,  0.2336, -0.4738, -0.4682],
       [-0.4223, -0.3602,  0.1303, -0.4499],
       [ 0.3713,  0.5264, -0.3916, -0.6432]])

In [106]:
depth_means.shape

(3, 4)

In [107]:
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[ 0., -0., -0., -0.],
       [-0.,  0.,  0., -0.],
       [ 0., -0., -0.,  0.]])

In [108]:
def demean_axis(arr, axis = 0):
    """Subtract the mean taken along ``axis`` from ``arr``.

    Parameters
    ----------
    arr : ndarray
        Array to demean.
    axis : int, default 0
        Axis along which the mean is computed and then removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis``
        is zero (up to floating-point error).
    """
    means = arr.mean(axis)

    # This generalizes things like [:, :, np.newaxis] to N dimensions:
    # keep every axis as-is and re-insert the reduced one with length 1
    # so that ``means`` broadcasts against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # A multi-dimensional index must be a tuple; NumPy no longer accepts a
    # list of slices/None here (deprecated in 1.15, an error since 1.23).
    return arr - means[tuple(indexer)]

### Setting Array Values by Broadcasting

In [109]:
arr = np.zeros((4, 3))
arr

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [110]:
arr[:] = 5

In [111]:
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [112]:
col = np.array([1.28, -0.42, 0.44, 1.6])
col

array([ 1.28, -0.42,  0.44,  1.6 ])

In [113]:
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

## A.4 Advanced ufunc Usage

### ufunc Instance Methods

In [114]:
arr = np.arange(10)

In [115]:
np.add.reduce(arr)

45

In [116]:
arr.sum()

45