# Numpy tutorial

Oliver W. Layton

CS251: Data Analysis and Visualization

Fall 2023

In [1]:
import functools
import time

import numpy as np

## Numpy ndarray basics

### Creation from Python lists

In [2]:
# Make a numpy array from a 2D python list
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [3]:
# print it
print(arr)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


### Data type of ndarray

In [4]:
# determine data type
print('Type of array is\n', arr.dtype)

Type of array is
 int64


Type can be changed in a few ways. 

1. when creating array — (a) implicitly or (b) explicitly
2. by casting types.

In [5]:
# 1a implicitly
arr = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
print(arr)
print(arr.dtype)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
float64


In [6]:
# 1b explicitly
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float)
print(arr)
print(arr.dtype)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
float64


In [7]:
# 2. NOTE: This is a METHOD of the array, not a FUNCTION
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# astype returns a NEW array with the requested dtype, so reassign to keep it
arr = arr.astype(float)


In [8]:
# Cast an existing array to another type with the .astype() method.
# The target type can also be a string type. Be careful in your CSV parser
# that your "numbers" aren't actually strings!
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr = arr.astype(float)
print(arr)
print(arr.dtype)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
float64


### Convert back to Python list

In [9]:
# Convert back from ndarray to Python list
arrAsList = arr.tolist()
print('Back as a Python list:\n', arrAsList)

Back as a Python list:
 [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]


### Other ways to create ndarrays quickly

#### 1. zeros

- We can plug in a list to get a multi-dimensional array
- We can plug in one int to get a vector of values

In [10]:
z = np.zeros([7, 5])
z

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [11]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

#### 2. ones

In [12]:
oneArr = np.ones([4, 5])
oneArr

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [13]:
# can easily make any constant array: scale an array of ones by the constant
# NOTE(review): despite the name `sixes`, this array is filled with 0.5 — consider renaming
sixes = 0.5*np.ones([4, 5])
sixes

array([[0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5]])

#### 3. Random values

In [14]:
# Uniform random values
np.random.random(size=(2, 3))

array([[0.5862783 , 0.76325538, 0.38843382],
       [0.31814178, 0.10885274, 0.83228734]])

#### 4. Equally spaced floats in an interval

In [18]:
np.linspace(-5, 5) # 50 values by default
np.linspace(-5, 5, 12)

array([-5.        , -4.09090909, -3.18181818, -2.27272727, -1.36363636,
       -0.45454545,  0.45454545,  1.36363636,  2.27272727,  3.18181818,
        4.09090909,  5.        ])

#### 5. Equally spaced ints in an interval

In [22]:
# np.arange(15)
# np.arange(-10, 15)
np.arange(-10, 15, 2)

array([-10,  -8,  -6,  -4,  -2,   0,   2,   4,   6,   8,  10,  12,  14])

#### 6. Identity matrix

In [None]:
np.eye(10)

### Check dimensions — `shape`

In [23]:
# check shape of 3D array
one = np.ones([3, 4, 5])
one.shape

(3, 4, 5)

In [24]:
# check number of dimensions (M)
one.ndim

3

In [25]:
# Access 1st dim (#rows), 2nd dim (#cols) (Use f-string)
another = np.zeros([5, 6])
print(f'The number of rows are {another.shape[0]} and the number of cols is {another.shape[1]}')

The number of rows are 5 and the number of cols is 6


In [26]:
# Check number of elements total
print('Num elements in arr_1:', another.size)

Num elements in arr_1: 30


## ndarray indexing

Basic Accessing and modifying of ndarrays.

### Access and modify single elements

In [27]:
# To access elements in a multidimensional ndarray use ONE set of square brackets []
# Make a new random array
np.random.seed(0)  # ensures random numbers come up the same each time. Useful for debugging.
randArr = np.random.random([3, 5])
randArr


array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606]])

In [28]:
# Get the 1st element
randArr[0, 0]

0.5488135039273248

In [29]:
# Modifying single values is similar
randArr[0, 0] = 9
print('arr is now:\n', randArr)

arr is now:
 [[9.         0.71518937 0.60276338 0.54488318 0.4236548 ]
 [0.64589411 0.43758721 0.891773   0.96366276 0.38344152]
 [0.79172504 0.52889492 0.56804456 0.92559664 0.07103606]]


### Slicing: real power of numpy

Use **colon** notation for all values in a dimension

Access and modify different ranges of data along different dimensions 

Make a 3x5 random array. Access 2nd column

In [31]:
np.random.seed(0)
randArr = np.random.random([3, 5])
randArr

array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606]])

In [32]:
randArr[:, 1]

array([0.71518937, 0.43758721, 0.52889492])

Access 1st row

In [36]:
randArr[0, :]
# randArr[0] # works too

array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ])

Access last 2 columns

In [38]:
randArr

array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606]])

In [39]:
# randArr[:, -2:]
randArr[:, 3:]

array([[0.54488318, 0.4236548 ],
       [0.96366276, 0.38344152],
       [0.92559664, 0.07103606]])

Access columns at indices 1-2 and in 1st row. Careful about off-by-one.

- Low range (before :) CONTAINS that index
- High range (after :) DOES NOT contain that index (the slice stops at index i-1)

In [40]:
randArr

array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606]])

In [41]:
randArr[0, 1:3]

array([0.71518937, 0.60276338])

Use slicing to assign values efficiently in batch without loops

In [42]:
randArr

array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606]])

In [45]:
# Assign 1st row to -1s
randArr[0, :] = -1
randArr

array([[-1.        , -1.        , -1.        , -1.        , -1.        ],
       [ 0.64589411,  0.43758721,  0.891773  ,  0.96366276,  0.38344152],
       [ 0.79172504,  0.52889492,  0.56804456,  0.92559664,  0.07103606]])

In [46]:
randArr.shape

(3, 5)

In [51]:
# Assign 1st row to increasing ints
randArr[0, :] = np.arange(randArr.shape[1])
randArr

array([[0.        , 1.        , 2.        , 3.        , 4.        ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606]])

In [52]:
# Multiply the 3rd row by 5 and write the result back into that row
randArr[2, :] = 5*randArr[2, :]
randArr

array([[0.        , 1.        , 2.        , 3.        , 4.        ],
       [0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [3.95862519, 2.6444746 , 2.84022281, 4.62798319, 0.35518029]])

### What if we want to access a set of rows or columns that are not adjacent?

Example: Say we want the first and the last column and rows 0, 2, 3.

Can't use colon notation. Instead use `np.ix_`

In [66]:
newArray = np.random.random([4, 5])
# Passing two plain index lists makes numpy pair them up element-by-element,
# so lists of length 3 and 2 cannot be broadcast together and indexing fails.
# The error IS the demonstration: catch and print it so "Run All" keeps going.
try:
    newArray[[0, 2, 3], [0, -1]]
except IndexError as err:
    print(f'IndexError: {err}')

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (3,) (2,) 



**Syntax for `np.ix_`:**
- `np.ix_` goes inside the square brackets: `arr[np.ix_(blah)]`
- Give it `M` arguments (e.g. 2 for a 2D matrix).
- Each argument is a Python list (or ndarray) of indices to take along that dimension.

In [68]:
newArray

array([[0.26538949, 0.52324805, 0.09394051, 0.5759465 , 0.9292962 ],
       [0.31856895, 0.66741038, 0.13179786, 0.7163272 , 0.28940609],
       [0.18319136, 0.58651293, 0.02010755, 0.82894003, 0.00469548],
       [0.67781654, 0.27000797, 0.73519402, 0.96218855, 0.24875314]])

In [67]:
# newArray[[0, 2, 3], [0, -1]]
newArray[np.ix_([0, 2, 3], [0, -1])]

array([[0.26538949, 0.9292962 ],
       [0.18319136, 0.00469548],
       [0.67781654, 0.24875314]])

In [69]:
np.arange(newArray.shape[1])

array([0, 1, 2, 3, 4])

In [70]:
newArray[np.ix_([0, 2, 3], np.arange(newArray.shape[1]))]

array([[0.26538949, 0.52324805, 0.09394051, 0.5759465 , 0.9292962 ],
       [0.18319136, 0.58651293, 0.02010755, 0.82894003, 0.00469548],
       [0.67781654, 0.27000797, 0.73519402, 0.96218855, 0.24875314]])

## Memory

- Numpy tries to be efficient with arrays, so assignment (e.g. `b = a`) does NOT copy any data — both names refer to the same array in memory. To get an independent copy, you need to use the `.copy()` method

In [75]:
a = np.linspace(-1, 1, 5)
a

array([-1. , -0.5,  0. ,  0.5,  1. ])

In [76]:
b = a
print(b)

[-1.  -0.5  0.   0.5  1. ]


In [77]:
b[0] = 99
# changed a!
a

array([99. , -0.5,  0. ,  0.5,  1. ])

In [78]:
# fixed with .copy()
a = np.linspace(-1, 1, 5)
b = a.copy()
b[0] = 99
print(a)
print(b)


[-1.  -0.5  0.   0.5  1. ]
[99.  -0.5  0.   0.5  1. ]


## Apply functions over dimensions (`axes`)

- Axes are the numpy term for different ndarray dimensions. 
- *Idea*: Do we want to apply an operation (e.g. sum) on the rows OR columns of a ndarray?
- *Example*: axis 0 is the row dimension, axis 1 is the column dimension, etc.
- We can apply functions over one or more axis super efficiently in one line of code! This is called **Vectorization** — MUCH MUCH faster than loops (stay tuned).

In [79]:
one = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]])
one

array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4]])

Sum along rows -> "collapse" across rows to get sum within each column — 3 numbers

In [80]:
np.sum(one, axis=0)

array([10, 10, 10])

Sum along columns -> "collapse" across columns to get sum within each row — 4 numbers

In [81]:
np.sum(one, axis=1)

array([ 3,  6,  9, 12])

**Careful:** Applying a function without specifying the axis may compute across the ENTIRE ndarray.

In [82]:
np.sum(one)

30

**Mnemonic trick:** Applying a function along an axis eliminates that dimension from the shape. Left with remaining dimensions.

In [84]:
print(one.shape)

print(f'Mean across axis 0: {np.mean(one, axis=0).shape}')

print(one.shape)

print(f'Mean across axis 1: {np.mean(one, axis=1).shape}')

(4, 3)
Mean across axis 0: (3,)
(4, 3)
Mean across axis 1: (4,)


## Broadcasting (basics)

**This is the most useful numpy feature thus far! This will become your bread-and-butter!**

We will cover the basics now and revisit broadcasting in more detail in a few weeks.

### Broadcasting scalars

As we saw, we can create an array of any size with any constant value WITHOUT ANY LOOPS. This is the simplest example of numpy **broadcasting** the scalar across the ndarray.

In [87]:
# Example with basic arithmetic
5*np.ones([5, 6])/2.5 + 10.3

array([[12.3, 12.3, 12.3, 12.3, 12.3, 12.3],
       [12.3, 12.3, 12.3, 12.3, 12.3, 12.3],
       [12.3, 12.3, 12.3, 12.3, 12.3, 12.3],
       [12.3, 12.3, 12.3, 12.3, 12.3, 12.3],
       [12.3, 12.3, 12.3, 12.3, 12.3, 12.3]])

### Applying an operation to corresponding values in two arrays that have the same shape

Broadcasting allows you to efficiently (*and in one line of code*) add, subtract, multiply, and perform other operations on corresponding values in two arrays.

#### Examples: subtracting arrays with several different shapes

In [None]:
# 1D arrays
arr1 = np.arange(10)
arr2 = 5*np.ones(10)
print(f'arr1: {arr1}')
print(f'arr2: {arr2}')
print(f'Shape of arr1: {arr1.shape}')
print(f'Shape of arr2: {arr2.shape}')

# Both arrays have shape (10,), so values are paired up index-by-index
arr3 = arr1 - arr2
print(arr3)
print(f'Shape of arr3: {arr3.shape}')

In [None]:
# 2D arrays
arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2 = np.array([[1, 1, 1], [0, 0, 0], [2, 2, 2]])
print(f'arr1:\n{arr1}')
print(f'arr2:\n{arr2}')
print(f'Shape of arr1: {arr1.shape}')
print(f'Shape of arr2: {arr2.shape}')

# Both arrays have shape (3, 3), so the subtraction is element-by-element
arr3 = arr1 - arr2
print(arr3)
print(f'Shape of arr3: {arr3.shape}')

### Singleton dimensions

Sometimes you have a 1D array with a shape like `(blah,)` and you want to use broadcasting to combine it (e.g. subtract) with another array that has the shape of a **column vector** of a matrix, `(blah, 1)`. Because the number of values in the two arrays match, you would think it would be possible to broadcast operations to corresponding values (e.g. subtract). BUT, broadcasting won't behave the way you want: numpy does not pair up corresponding values between a `(blah,)` array and a `(blah, 1)` array (let's try).

In [None]:
arr1 = np.arange(3)
arr2 = np.array([[1], [2], [3]])
print(f'arr1:\n{arr1}')
print(f'arr2:\n{arr2}')
print(f'arr1 shape: {arr1.shape}')
print(f'arr2 shape: {arr2.shape}')
print('Trying to broadcast arr1 - arr2...')
# (3,) minus (3, 1) does not raise an error: numpy broadcasts both operands to
# (3, 3), producing every pairwise difference rather than 3 matched ones.
arr3 = arr1 - arr2
arr3


To get broadcasting to subtract off corresponding values in the arrays, we need to add a **singleton dimension** — an extra 1 dimension to `arr1` so that the shape matches the other array and so that numpy interprets `arr1` also as a column vector.

In [None]:
arr1 = np.arange(3)
# Add a trailing singleton dimension: shape (3,) -> (3, 1), i.e. a column vector
arr1 = arr1[:, np.newaxis]

# arr2 comes from an earlier cell (presumably the (3, 1) column vector — verify)
print(f'arr1:\n{arr1}')
print(f'arr1 shape: {arr1.shape}')
print(f'arr2 shape: {arr2.shape}')
print('Trying to broadcast arr1 - arr2...')
arr3 = arr1 - arr2
arr3

### Squeeze: How to get rid of all singleton dimensions

"Undo" a new axis / singleton dimension

In [None]:

# np.squeeze returns the array with every axis of length 1 removed
arr1 = np.squeeze(arr1)
print(f'arr1:\n{arr1}')
print(f'arr1 shape: {arr1.shape}')

Removes ALL singleton dimensions (if you have more than one):

## Vectorization speed vs loops

Time computation of summing a ndarray with loop vs vectorized.

In [None]:
def timeit(fun):
    '''Decorator that prints how long each call to `fun` takes.

    Parameters:
    -----------
    fun: callable. The function whose runtime should be measured.

    Returns:
    -----------
    callable. A wrapper that forwards all positional and keyword arguments to
        `fun`, prints the elapsed time, and returns whatever `fun` returned.
    '''
    @functools.wraps(fun)  # keep the name and docstring of `fun` on the wrapper
    def timer(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        start = time.perf_counter()
        result = fun(*args, **kwargs)
        end = time.perf_counter()
        print(f'Took {end - start:.3} secs to run.')
        return result
    return timer


@timeit
def sumLoop():
    '''Use for loop to sum a row vector holding the ints 1 to 999999.

    The result is only computed for timing purposes and is not returned.
    '''
    # np.arange builds the array directly, so the measured time is dominated by
    # the loop below rather than by building a million-element Python list.
    longRow = np.arange(1, 1000000)
    theSum = 0
    # One Python-level index and add per element: this is the slow part
    for i in range(len(longRow)):
        theSum += longRow[i]


@timeit
def sumVectorized():
    '''Vectorized version of summing a row vector holding the ints 1 to 999999.

    The result is only computed for timing purposes and is not returned.
    '''
    # Same array construction as sumLoop so the two timings are comparable;
    # np.arange keeps the setup cost small relative to the work being measured.
    longRow = np.arange(1, 1000000)
    # A single call sums the whole array without a Python-level loop
    theSum = np.sum(longRow)

In [None]:
# Dynamic typing in python makes for loops with lots of small
# operations slow (each iteration pays interpreter overhead)
print('sumLoop:')
sumLoop()

# Vectorization lets Numpy avoid per-element type lookups at runtime
# and use efficient pre-compiled functions to batch-process
# the computation over the matrix
print('sumVectorized:')
sumVectorized()

## Combining multiple ndarrays

**Problem:**
- You have two ndarrays and want to concatenate them
- You have an ndarray and **want to append a column or row vector**

### Add/append a new column — "stack horizontally"

**Mnemonic**: Columns go horizontally.

Have `a`:

    [[1, 2]
     [3, 4]]
and `b`

    [[9]
     [9]]
    
want to make:

    [[1, 2, 9]
     [3, 4, 9]]
    
i.e. stack horizontally. Could be two matrices (not just a matrix and a vector).

**Caveat:** We need to make sure the shapes are compatible for stacking (when stacking horizontally, the number of rows must match):

- Result shape = `(2, 3)`
- We are starting with `a` shape: `(2, 2)`

The shape of `b` needs to be `(2, 1)` (why?)

## Sorting ndarrays

Numpy allows you to quickly sort numbers or strings.

In [None]:
# Let's sort this array of random values between 0 and 1.
randArr = np.random.random(size=(100,))
# np.sort returns a sorted COPY in ascending order; randArr itself is unchanged
randArrSorted = np.sort(randArr)
# Show only the 10 smallest values instead of dumping all 100
randArrSorted[:10]


In [None]:
# Let's sort this list of words that start with different letters from 'a' to 'z'
# (no line-continuation backslashes needed inside the square brackets)
word_list = ['leopard', 'apple', 'flamingo', 'giraffe', 'iguana', 'banana', 'yak', 'rhinoceros', 'zebra', 'cherry',
             'jaguar', 'walrus', 'x-ray', 'hippo', 'narwhal', 'quokka', 'vulture', 'panda', 'squirrel', 'monkey',
             'umbrella', 'ostrich', 'elephant', 'tiger', 'dog', 'kangaroo']
# np.sort converts the list to an ndarray of strings and returns it sorted alphabetically
sorted_words = np.sort(word_list)
sorted_words


To reverse the order of an ndarray, you can use Python-like negative indexing notation