For additional information, see Python Data Science Handbook chapter 2

In [None]:
import numpy as np

## Concatenating Arrays

In [None]:
# Two short 1-D integer arrays used throughout the concatenation examples.
x = np.arange(4)          # 0, 1, 2, 3
y = np.arange(100, 104)   # 100, 101, 102, 103
print(x, y, sep="\n")

In [None]:
np.concatenate([x,y])  # with no axis argument the arrays are joined along axis 0, end to end

In [None]:
np.concatenate([x,y], axis = 0)  # axis = 0 is the default, so this gives the same result as np.concatenate([x,y])

In [None]:
np.concatenate([x,y], axis = 1) # throws an AxisError: x and y are 1-D, so axis 1 is out of bounds

In [None]:
x.shape # the shape (4,) has a single entry, so x only has axis 0; axis = 1 cannot be used because it does not exist

In [None]:
np.vstack([x,y])   # vstack treats each 1-D array as one row and stacks the rows into a 2 x 4 array

In [None]:
# Two 2 x 3 matrices, filled row by row: xm holds 0..5 and ym holds 100..105.
xm = np.arange(6).reshape(2, 3)
ym = np.arange(100, 106).reshape(2, 3)
print(xm, ym, sep="\n")

In [None]:
xm.shape

In [None]:
ym.shape

In [None]:
print(np.concatenate([xm,ym]))  # default axis is 0: the rows of ym are placed below the rows of xm

In [None]:
print(np.concatenate([xm,ym], axis = 0))
# axes are reported as rows (axis 0), then columns (axis 1).
# concatenating along axis 0 adds more rows: the two 2 x 3 arrays become one 4 x 3 array

In [None]:
print(np.concatenate([xm,ym], axis = 1))
# concatenating along axis 1 adds more columns: the two 2 x 3 arrays become one 2 x 6 array

In [None]:
np.vstack([xm, ym])  # for 2-D arrays this matches np.concatenate([xm, ym], axis = 0)

In [None]:
np.hstack([xm, ym])  # for 2-D arrays this matches np.concatenate([xm, ym], axis = 1)

## Math Operators with numpy arrays

In [None]:
print(x)
print(y)
print(xm)
print(ym)

In [None]:
x + 5  # the scalar 5 is added to every element of x

In [None]:
x + y  # arrays of the same shape are added element by element

In [None]:
xm + 5

In [None]:
xm + ym

In [None]:
np.dot(x,y)   # 0 * 100 + 1 * 101 + 2 * 102 + 3 * 103

In [None]:
print(xm)
print(ym)

In [None]:
x * y  # * multiplies element by element; it is not the dot product computed by np.dot

In [None]:
xm * ym  # element-wise product of the two 2 x 3 arrays, not matrix multiplication

# Basic Math

In [None]:
x = np.arange(4)
print(x)

In [None]:
print(x + 5)

In [None]:
print(x - 5)

In [None]:
print(x * 2)

In [None]:
print(x / 2)  # true division: the result is float even though x holds integers

In [None]:
print(-x)

In [None]:
print(x ** 2)

In [None]:
print(x % 2) # modulo: the remainder after dividing each element by 2

In [None]:
print(abs(x)) # absolute value of each element

# Trig functions
Note that the functions are preceded by `np.`

In [None]:
theta = np.linspace(0, np.pi, 5)  # 5 evenly spaced angles from 0 to pi (both ends included), in radians
print(theta)

In [None]:
print(np.sin(theta))  # sin(pi) prints as a tiny number (about 1e-16) rather than exactly 0 because of floating-point rounding

In [None]:
print(np.cos(theta))  # cos(pi/2) prints as a tiny number (about 6e-17) rather than exactly 0 because pi/2 is not exactly representable as a float

In [None]:
print(np.tan(theta))  # tan(pi/2) is mathematically undefined; floating-point rounding yields a huge number (about 1.6e16) instead

# Log and Exp

In [None]:
x = np.array([1, 10, 100])
print(np.log(x))   # natural log (base e)
print(np.log10(x)) # common log (base 10): gives 0, 1, 2 for these values

In [None]:
y = np.arange(3)
print(np.exp(y))  # e^y

In [None]:
print(np.exp2(y))  # 2^y

In [None]:
print(np.power(3, y)) # 3^y: 3 raised to each element of y

# Aggregates

you can use `sum()`

or `np.sum()`

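`np.sum()` is faster than the built-in `sum()`, but the two do not always behave the same way (for example, on a 2-D array `sum()` adds the rows together, while `np.sum()` adds up every element)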

In [None]:
x = np.arange(100)
print(x)

In [None]:
print(sum(x))

In [None]:
print(np.sum(x))

In [None]:
big_array = np.random.rand(10000)
# the built-in sum steps through the elements one at a time in Python,
# while np.sum does the whole loop in compiled code
%timeit sum(big_array)
%timeit np.sum(big_array)  # the np version is much faster

## min and max

In [None]:
print(min(big_array))
print(max(big_array))

In [None]:
print(np.min(big_array))
print(np.max(big_array))

In [None]:
%timeit min(big_array)
%timeit np.min(big_array)  # the np version is much faster

## summaries for matrices

In [None]:
# Build a reproducible 3 x 4 matrix holding the integers 0..11 in shuffled order.
# (alternative with random floats: M = np.random.random((3, 4)))
np.random.seed(1)
M = np.arange(12)
np.random.shuffle(M)   # shuffles M in place
M = M.reshape(3, 4)
print(M)

In [None]:
sum(M) # the built-in sum iterates over the rows of M and adds them together, so it returns the column totals

In [None]:
np.sum(M) # np.sum with no axis adds up every element and returns a single number

In [None]:
np.sum(M, axis = 0)  # np.sum function with axis specified
# matrices have two dimensions:
# axis 0 runs down the rows, axis 1 runs across the columns
# np.sum with axis = 0 adds the rows together, so you end up getting column totals

In [None]:
np.sum(M, axis = 1)  # adds across the columns within each row, so you get row totals

In [None]:
np.min(M, axis = 0)  # smallest value in each column

In [None]:
np.std(M)  # standard deviation of all 12 elements; by default NumPy divides by n (ddof = 0), unlike R's sd(), which divides by n - 1

In [None]:
np.std(M, axis = 0)  # one standard deviation per column

## dealing with nan
`nan` ("not a number") is the float value used for a result that is not a number. We often use it in place of a missing value.
`nan` exists only as a float; there is no integer `nan`.

In [None]:
# Create nan directly from its string name, then show the value and its type.
x = float("nan")
print(x, type(x), sep="\n")

In [None]:
y = float("inf")  # y is the float representation of infinity
# inf / inf and inf - inf have no defined value, so both calculations yield nan
print(y / y)
print(y - y)

In [None]:
np.sum([x, 2])  # nan propagates: a sum that includes nan is itself nan

In [None]:
np.nansum([x, 2])   # nansum skips the nan and returns 2.0; in R you have the option na.rm = TRUE

The following table provides a list of useful aggregation functions available in NumPy:

|Function Name      |   NaN-safe Version  | Description                                   |
|-------------------|---------------------|-----------------------------------------------|
| ``np.sum``        | ``np.nansum``       | Compute sum of elements                       |
| ``np.prod``       | ``np.nanprod``      | Compute product of elements                   |
| ``np.mean``       | ``np.nanmean``      | Compute mean of elements                      |
| ``np.std``        | ``np.nanstd``       | Compute standard deviation                    |
| ``np.var``        | ``np.nanvar``       | Compute variance                              |
| ``np.min``        | ``np.nanmin``       | Find minimum value                            |
| ``np.max``        | ``np.nanmax``       | Find maximum value                            |
| ``np.argmin``     | ``np.nanargmin``    | Find index of minimum value                   |
| ``np.argmax``     | ``np.nanargmax``    | Find index of maximum value                   |
| ``np.median``     | ``np.nanmedian``    | Compute median of elements                    |
| ``np.percentile`` | ``np.nanpercentile``| Compute rank-based statistics of elements     |
| ``np.any``        | N/A                 | Evaluate whether any elements are true        |
| ``np.all``        | N/A                 | Evaluate whether all elements are true        |

## Broadcasting

This is a similar concept to recycling values in R, but it only works when the dimensions are compatible.

In [None]:
# Two 1-D arrays of equal length add element by element.
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
print(np.add(a, b))

In [None]:
c = np.array([7,8])
print(a + c)  # doesn't work: raises ValueError because lengths 3 and 2 cannot be broadcast together

In [None]:
print(a)

In [None]:
e = np.ones([3,3])
print(e)

In [None]:
print(e + a)  # the array a gets 'broadcast' across all three rows

In [None]:
print(a.reshape([3,1]))  # we reshape a to be a 3x1 array

In [None]:
print(e + a.reshape([3,1])) # the reshaped array is broadcast across columns

In [None]:
d = np.vstack([a,b])  # we stack the arrays a and b vertically
print(d)

In [None]:
a

In [None]:
print(d + a)  # a is broadcast across the rows of d: it is added to each row

In [None]:
print(c)

In [None]:
print(d + c)  # raises ValueError: d is 2 x 3 but c has length 2, which does not match d's last dimension (3)

In [None]:
print(d + c.reshape([2,1]))  # after we reshape c to be a column, we can broadcast it

In [None]:
# A column vector and a 1-D array whose shapes are compatible for broadcasting.
e = np.arange(10)[:, np.newaxis]   # shape (10, 1)
f = np.arange(11)                  # shape (11,)
print(e, f, sep="\n")

In [None]:
print(e + f)  ## e and f are broadcast into compatible matrices and then added

In [None]:
print(e * f)  ## e and f are broadcast into compatible matrices and then multiplied element-wise

In [None]:
d.reshape((1,6)) + d.reshape((6,1))  # a 1 x 6 row plus a 6 x 1 column broadcasts to a 6 x 6 array of all pairwise sums

# Boolean Operators in NumPy

In [None]:
x = np.arange(6)
print(x)

In [None]:
print(x < 3)

In [None]:
print(x >= 3)

In [None]:
print(x == 3)

In [None]:
# the results can then be used to subset
print(x[x >= 3])

In [None]:
np.sum(x >= 3) # True = 1, False = 0, so sum counts how many are true

In [None]:
np.mean(x >= 3)  # finds the proportion that is True

### Working with matrices

In [None]:
y = np.arange(12).reshape([3,4])
print(y)

In [None]:
print(y >= 6)

In [None]:
np.sum(y >= 6)

In [None]:
np.sum(y >= 6, axis = 0)  # you can perform sums and other aggregate functions axis-wise on the boolean matrix

In [None]:
np.sum(y >= 6, axis = 1)

## Bitwise (element-wise) Boolean operators

In [None]:
# Two boolean arrays that together cover every True/False pairing.
a = np.array([1, 1, 0, 0], dtype=bool)
b = np.array([1, 0, 1, 0], dtype=bool)
print(a, b, sep="\n")

In [None]:
print(a & b) # bitwise and

In [None]:
print(a | b) # bitwise or

In [None]:
print(a ^ b) # bitwise xor (exclusive or)

In [None]:
print(~a)  # bitwise not

In [None]:
np.any(a)

In [None]:
np.all(a)

# fancy indexing
Regular lists in python do not support fancy indexing, but NumPy does!

In [None]:
# Ten reproducible random integers drawn from 0..99.
np.random.seed(1)
x = np.random.randint(0, 100, 10)
print(x)

In [None]:
index = [0, 1, 5]
print(x[index])  # a list of positions selects those elements: x[0], x[1] and x[5]

In [None]:
# Build a 2 x 3 array of index positions, one list per row.
a, b = [1, 4, 7], [2, 3, 8]
ind = np.array([a, b])
print(ind)

In [None]:
print(x[ind])  # the result takes the shape of the index array (2 x 3), not the shape of x

In [None]:
X = np.arange(12).reshape((3, 4))
print(X)

In [None]:
# with one index array per axis, the indices are paired up position by position:
# this selects X[0, 2], X[1, 1] and X[2, 3]
row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
X[row, col]

# sorting

- `np.sort()` is like sort() in R
- `np.argsort()` is like order() in R. It returns the indexes that would put the values in sorted order

In [None]:
np.random.seed(2)
x = np.arange(5)
np.random.shuffle(x)
print(x)

In [None]:
x.sort() # sorts x in place
print(x)

In [None]:
# Show an unsorted array followed by the index order that would sort it.
y = np.array([5, 2, 1, 4])
print(y, np.argsort(y), sep="\n")

In [None]:
d = y.argsort()  # d holds the positions of y's values from smallest to largest
y[d]  # indexing y with d returns its values in sorted order (like y[order(y)] in R)

## Sorting along rows or columns

A useful feature of NumPy's sorting algorithms is the ability to sort along specific rows or columns of a multidimensional array using the axis argument. For example:

In [None]:
# A reproducible 4 x 6 matrix of random digits 0..9.
np.random.seed(1)
X = np.random.randint(10, size=(4, 6))
print(X)

In [None]:
# sort each column of X
# np.sort returns a sorted copy of X; X itself is not modified
np.sort(X, axis=0)

In [None]:
# sort each row of X (np.sort returns a copy, so X is left unchanged)
np.sort(X, axis=1)