# NumPy tutorial

Oliver W. Layton

CS251: Data Analysis and Visualization

Spring 2024

In [1]:
import numpy as np
import time

## Numpy ndarray basics

### Creation from Python lists

In [6]:
# Make a numpy array from a 1D python list
pyList = [1, 2, 3]
arr = np.array(pyList)
print(arr)

[1 2 3]


In [5]:
# Make a reproducible 3x5 array of uniform random floats and print it.
# A separate name is used so the integer `arr` built from the Python list
# in the previous cell is still the array that the dtype cell below inspects
# when the notebook is run from top to bottom.
np.random.seed(0)  # fixed seed: the same values come up on every run
randArr = np.random.random([3, 5])
print(randArr)
print(randArr[1, :])  # 2nd row, all columns

[[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
 [0.64589411 0.43758721 0.891773   0.96366276 0.38344152]
 [0.79172504 0.52889492 0.56804456 0.92559664 0.07103606]]
[0.64589411 0.43758721 0.891773   0.96366276 0.38344152]


### Data type of ndarray

In [8]:
# determine data type: dtype is the type of the values stored in the array
print('Type of array is\n', arr.dtype)

Type of array is
 int64


Type can be changed in a few ways. 

1. when creating array — (a) implicitly or (b) explicitly
2. by casting types.

In [10]:
# 1a implicitly: the values are written as floats (note the trailing dots),
# so numpy picks a float dtype on its own
arr = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
print(arr)
print(arr.dtype)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
float64


In [11]:
# 1b explicitly: the values are written as ints, but the dtype argument
# asks for a float array
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float)
print(arr)
print(arr.dtype)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
float64


In [None]:
# 2. By casting an existing array with astype (done in the next cell).
# NOTE: astype is a METHOD of the array, not a FUNCTION
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])


In [12]:
# Cast an int array to float with the astype method.
# The dtype can also be string. Be careful in your CSV parser that your "numbers"
# aren't actually strings!
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr = arr.astype(float) # astype hands back the converted array, so reassign it

print(arr)
print(arr.dtype)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
float64


### Convert back to Python list

In [13]:
# Convert back from ndarray to Python list
# (a 2D array comes back as a list of row lists, as the printout shows)

arrAsList = arr.tolist()

print('Back as a Python list:\n', arrAsList)

Back as a Python list:
 [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]


### Other ways to create ndarrays quickly

#### 1. zeros

- We can plug in a list to get a multi-dimensional array
- We can plug in one int to get a vector of values

In [15]:
# List argument -> one entry per dimension: 2 rows and 10 columns of zeros
arr = np.zeros([2,10])
print(arr)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [17]:
# Single int argument -> 1D vector with that many zeros
arr = np.zeros(5)
print(arr)

[0. 0. 0. 0. 0.]


#### 2. ones

In [18]:
# Shape is given the same way as for zeros: 10 rows and 2 columns of ones
arr = np.ones([10, 2])
print(arr)

[[1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]]


In [None]:
# can easily make any constant array


#### 3. Random values

In [20]:
# Uniform random values; the list gives the shape (5 rows, 8 columns)
randArr = np.random.random([5, 8])
# The shape can also be passed by keyword: np.random.random(size=(5, 8))
print(randArr)

[[0.57019677 0.43860151 0.98837384 0.10204481 0.20887676 0.16130952
  0.65310833 0.2532916 ]
 [0.46631077 0.24442559 0.15896958 0.11037514 0.65632959 0.13818295
  0.19658236 0.36872517]
 [0.82099323 0.09710128 0.83794491 0.09609841 0.97645947 0.4686512
  0.97676109 0.60484552]
 [0.73926358 0.03918779 0.28280696 0.12019656 0.2961402  0.11872772
  0.31798318 0.41426299]
 [0.0641475  0.69247212 0.56660145 0.26538949 0.52324805 0.09394051
  0.5759465  0.9292962 ]]


#### 4. Equally spaced floats in an interval

In [25]:
# Only start and stop given: linspace returns 50 evenly spaced values
# and includes both endpoints (see the printout)
arr = np.linspace(0, 10)
print(arr)
# The third argument sets how many values to generate
arr = np.linspace(0, 10, 5)
print(arr)

[ 0.          0.20408163  0.40816327  0.6122449   0.81632653  1.02040816
  1.2244898   1.42857143  1.63265306  1.83673469  2.04081633  2.24489796
  2.44897959  2.65306122  2.85714286  3.06122449  3.26530612  3.46938776
  3.67346939  3.87755102  4.08163265  4.28571429  4.48979592  4.69387755
  4.89795918  5.10204082  5.30612245  5.51020408  5.71428571  5.91836735
  6.12244898  6.32653061  6.53061224  6.73469388  6.93877551  7.14285714
  7.34693878  7.55102041  7.75510204  7.95918367  8.16326531  8.36734694
  8.57142857  8.7755102   8.97959184  9.18367347  9.3877551   9.59183673
  9.79591837 10.        ]
[ 0.   2.5  5.   7.5 10. ]


#### 5. Equally spaced ints in an interval

In [28]:
# arange(11) gives the ints 0 through 10: the stop value itself is excluded
my_ints = np.arange(11)
print(my_ints)

[ 0  1  2  3  4  5  6  7  8  9 10]


#### 6. Identity matrix

In [None]:
# 10x10 identity matrix: ones on the main diagonal, zeros everywhere else
np.eye(10)

### Check dimensions — `shape`

In [None]:
# check shape of 3D array: a tuple with one entry per dimension, here (3, 4, 5)
arr = np.zeros([3, 4, 5])
arr.shape

In [None]:
# check number of dimensions (M)


In [None]:
# Access 1st dim (#rows), 2nd dim (#cols) (Use f-string)


In [None]:
# Check number of elements total (counted across all dimensions)
print('Num elements in arr:', arr.size)

In [None]:
# the familiar Python len function works on ndarrays (any number of dims).
# It returns the size (length) of the 1st dimension

## ndarray indexing

Basic Accessing and modifying of ndarrays.

### Access and modify single elements

In [None]:
# To access elements in a multidimensional ndarray use ONE set of square brackets []
# Make a new random array
# NOTE(review): only the seed is set in this cell; the array itself is not created here
np.random.seed(0)  # ensures random numbers come up the same each time. Useful for debugging.


In [None]:
# Get the 1st element


In [None]:
# Modifying single values is similar
# (put the indexed element on the left-hand side of an assignment)

print('arr is now:\n', arr)

### Slicing: real power of numpy

Use **colon** notation for all values in a dimension

Access and modify different ranges of data along different dimensions 

Make a 4x5 random array. Access the 1st and 4th columns (indices 0 and 3)

In [43]:
np.random.seed(0)  # same random values on every run
arr = np.random.random([4, 5])
print(arr)
# ":" keeps every row; the list picks out the columns at indices 0 and 3
print(arr[:, [0, 3]])

[[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
 [0.64589411 0.43758721 0.891773   0.96366276 0.38344152]
 [0.79172504 0.52889492 0.56804456 0.92559664 0.07103606]
 [0.0871293  0.0202184  0.83261985 0.77815675 0.87001215]]
[[0.5488135  0.54488318]
 [0.64589411 0.96366276]
 [0.79172504 0.92559664]
 [0.0871293  0.77815675]]


Access 1st row

Access last 2 columns

Access columns at indices 1-2 and in 1st row. Careful about off-by-one.

- Low range (before :) CONTAINS that index
- High range (after :) DOES NOT contain that index (the slice stops one before it, at index high-1)

Use slicing to assign values efficiently in batch without loops

In [None]:
# Assign 1st row to -1s


In [None]:
# Assign 1st row to increasing ints


In [None]:
# Multiply the 3rd row by 5 times itself and update the row


### What if we want to access a select set of rows in multiple columns?

Example: Say we want the first and last rows and columns 1, 3, and 5 (indices 0, 2, and 4). If we try this by passing the two index lists directly in the square brackets, it won't work!

Can't use regular indexing notation. Instead use `np.ix_`

In [None]:
# This doesn't work!
# arr[[0, -1], [0, 2, 4]]

**Syntax for `np.ix_`:**
- `np.ix_` goes inside the square brackets: `arr[np.ix_(blah)]`
- Give it `M` arguments (e.g. 2 for a 2D matrix).
- Each argument is a Python list (or ndarray) of indices to take along that dimension.

array([[-1.        , -1.        , -1.        ],
       [ 3.95862519,  2.84022281,  0.35518029]])

## Memory

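- Numpy tries to be efficient with arrays, so plain assignment (`b = a`) does not copy the data at all: both names refer to the same array, and changing one changes the other. To get an independent copy, you need to use the `.copy()` method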

In [None]:
# 5 evenly spaced floats from -1 to 1
a = np.linspace(-1, 1, 5)
a

In [None]:

# Plain assignment does not copy: `b` is a second name for the same array as `a`,
# so writing through `b` also changes what `a` shows.
b = a
b[0] = 99
print(b)

In [None]:
# changed a!
a

In [None]:
# fixed with .copy()


## Apply functions over dimensions (`axes`)

- Axes are the numpy term for different ndarray dimensions. 
- *Idea*: Do we want to apply an operation (e.g. sum) on the rows OR columns of a ndarray?
- *Example*: in a 2D array, axis 0 indexes the rows, axis 1 indexes the columns, etc.
- We can apply functions over one or more axis super efficiently in one line of code! This is called **Vectorization** — MUCH MUCH faster than loops (stay tuned).

In [None]:
# 4 rows x 3 columns; every value in a row equals that row's (1-based) number
one = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]])
one

Sum along rows -> "collapse" across rows to get sum within each column — 3 numbers

In [None]:
np.random.seed(0)  # same random values on every run
# 3D example array with shape (5, 6, 7)
arr = np.random.random([5, 6, 7])



Sum along columns -> "collapse" across columns to get sum within each row — 4 numbers

**Careful:** Applying a function without specifying the axis may compute across the ENTIRE ndarray.

**Mnemonic trick:** Applying a function along an axis eliminates that dimension from the shape. Left with remaining dimensions.

In [None]:
# Shape before reducing: (rows, columns)
print(one.shape)

# Averaging along axis 0 collapses the rows: one mean per column
print(f'Mean across axis 0: {one.mean(axis=0)}')

print(one.shape)

# Averaging along axis 1 collapses the columns: one mean per row
print(f'Mean across axis 1: {one.mean(axis=1)}')

## Broadcasting (basics)

**This is the most useful numpy feature thus far! This will become your bread-and-butter!**

We will cover the basics now and revisit broadcasting in more detail in a few weeks.

### Broadcasting scalars

As we saw, we can create an array of any size with any constant value WITHOUT ANY LOOPS. This is the simplest example of numpy **broadcasting** the scalar across the ndarray.

In [None]:
# Example with basic arithmetic


### Applying an operation to corresponding values in two arrays that have the same shape

Broadcasting allows you to efficiently (*and in one line of code*) add, subtract, multiply, and perform other operations on corresponding values in two arrays.

#### Examples: subtracting arrays with several different shapes

In [None]:
# 1D arrays
arr1 = np.arange(10)
arr2 = 5*np.ones(10)
print(f'arr1: {arr1}')
print(f'arr2: {arr2}')
print(f'Shape of arr1: {arr1.shape}')
print(f'Shape of arr2: {arr2.shape}')
# Same shapes, so the subtraction pairs values up position by position
arr3 = arr1 - arr2
print(arr3)
print(f'Shape of arr3: {arr3.shape}')

In [None]:
# 2D arrays
arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2 = np.array([[1, 1, 1], [0, 0, 0], [2, 2, 2]])
print(f'arr1:\n{arr1}')
print(f'arr2:\n{arr2}')
print(f'Shape of arr1: {arr1.shape}')
print(f'Shape of arr2: {arr2.shape}')
# Same shapes, so the subtraction pairs values up position by position
arr3 = arr1 - arr2
print(arr3)
print(f'Shape of arr3: {arr3.shape}')

### Singleton dimensions

Sometimes you have a 1D array that has shape like `(blah,)` but you need to use broadcasting to subtract it with another array that has a shape of a **column vector** of a matrix `(blah, 1)`. Because the number of values in the two arrays match, you would think it would be possible to broadcast operations to corresponding values (e.g. subtract). BUT, broadcasting won't work like you want (let's try).

In [None]:
arr1 = np.arange(3)
arr2 = np.array([[1], [2], [3]])
print(f'arr1:\n{arr1}')
print(f'arr2:\n{arr2}')
print(f'arr1 shape: {arr1.shape}')
print(f'arr2 shape: {arr2.shape}')
print('Trying to broadcast arr1 - arr2...')
# (3,) against (3, 1) does not pair the values up one-to-one: numpy stretches
# both operands and returns a full (3, 3) grid of differences instead.
arr3 = arr1 - arr2
print(arr3)
print(f'arr3 shape: {arr3.shape}')


To get broadcasting to subtract off corresponding values in the arrays, we need to add a **singleton dimension** — an extra 1 dimension to `arr1` so that the shape matches the other array and so that numpy interprets `arr1` also as a column vector.

In [None]:
arr1 = np.arange(3)
# Add a singleton dimension: shape (3,) becomes (3, 1), i.e. a column vector
arr1 = arr1[:, np.newaxis]

print(f'arr1:\n{arr1}')
print(f'arr1 shape: {arr1.shape}')
print(f'arr2 shape: {arr2.shape}')
print('Trying to broadcast arr1 - arr2...')
# Shapes now match, so values are subtracted position by position
arr3 = arr1 - arr2
arr3

### Squeeze: How to get rid of all singleton dimensions

"Undo" a new axis / singleton dimension

In [None]:

# squeeze removes every length-1 (singleton) dimension, e.g. (3, 1) -> (3,)
arr1 = np.squeeze(arr1)
print(f'arr1:\n{arr1}')
print(f'arr1 shape: {arr1.shape}')

Removes ALL singleton dimensions (if you have more than one):

## Vectorization speed vs loops

Time computation of summing a ndarray with loop vs vectorized.

In [None]:
def timeit(fun):
    '''Decorator that reports how long each call to `fun` takes.

    Parameters:
    -----------
    fun: callable. The function to time.

    Returns:
    -----------
    callable. Wrapper that forwards its arguments to `fun`, prints the elapsed
        time in seconds (3 significant digits), and returns whatever `fun` returned.
    '''
    from functools import wraps

    @wraps(fun)  # keep the wrapped function's name and docstring
    def timer(*args, **kwargs):
        # perf_counter is the clock intended for measuring short durations:
        # it never jumps backwards, unlike the wall clock read by time.time()
        start = time.perf_counter()
        result = fun(*args, **kwargs)
        end = time.perf_counter()
        print(f'Took {end - start:.3} secs to run.')
        return result
    return timer


@timeit
def sumLoop():
    '''Sum the ints 1 through 999999 one element at a time with a for loop.

    Slow baseline to compare against sumVectorized. The total is computed
    only so that there is work to time; nothing is returned.
    '''
    # Build the array with arange rather than a Python list comprehension so
    # the measured time reflects the summing loop, not the array construction
    longRow = np.arange(1, 1000000)
    theSum = 0
    for i in range(len(longRow)):
        theSum += longRow[i]


@timeit
def sumVectorized():
    '''Sum the ints 1 through 999999 with a single vectorized np.sum call.

    Fast counterpart of sumLoop. The total is computed only so that there is
    work to time; nothing is returned.
    '''
    # Build the array with arange rather than a Python list comprehension so
    # the measured time reflects the summation, not the array construction
    longRow = np.arange(1, 1000000)
    theSum = np.sum(longRow)

In [None]:
# Dynamic typing in python makes for loops with lots of small
# operations slow: every iteration goes through the interpreter
print('sumLoop:')
sumLoop()

# Vectorization lets Numpy skip that per-element work at runtime
# and use efficient pre-compiled functions to batch-process
# the computation over the whole array
print('sumVectorized:')
sumVectorized()

## Combining multiple ndarrays

**Problem:**
- You have two ndarrays and want to concatenate them
- You have an ndarray and **want to append a column or row vector**

### Add/append a new column — "stack horizontally"

**Mnemonic**: Columns go horizontally.

Have `a`:

    [[1, 2]
     [3, 4]]
and `b`

    [[9]
     [9]]
    
want to make:

    [[1, 2, 9]
     [3, 4, 9]]
    
i.e. stack horizontally. Could be two matrices (not just a matrix and a vector).

**Caveat:** We need to make sure the shapes are compatible for stacking (the number of rows must match):

- Result shape = `(2, 3)`
- We are starting with `a` shape: `(2, 2)`

The shape of `b` needs to be `(2, 1)` (why?)

## Sorting ndarrays

Numpy allows you to quickly sort numbers or strings.

In [None]:
# Let's sort this 1D array of 100 random values between 0 and 1.
randArr = np.random.random(size=(100,))


In [None]:
# Unsorted words to practise on: 26 entries, one starting with each letter 'a'..'z'
word_list = [
    'leopard', 'apple', 'flamingo', 'giraffe', 'iguana',
    'banana', 'yak', 'rhinoceros', 'zebra', 'cherry',
    'jaguar', 'walrus', 'x-ray', 'hippo', 'narwhal',
    'quokka', 'vulture', 'panda', 'squirrel', 'monkey',
    'umbrella', 'ostrich', 'elephant', 'tiger', 'dog',
    'kangaroo',
]


To reverse the order of an ndarray, you can use Python-like negative indexing notation

## Finding the index of the maximum item in an ndarray

This will come up at times during the semester.

In [None]:
arr = np.array([1, 3, 7, 5])
# argmax returns the position of the largest value (7 sits at index 2), not the value itself
print(f'The INDEX of the max item in arr is {np.argmax(arr)}.')

The INDEX of the max item in arr is 2.
