# `numpy` and `pandas`

In [1]:
# Libraries
import math
import numpy as np
import random

## Getting started

https://numpy.org/doc/stable/user/whatisnumpy.html

In [2]:
# Construct an array
a = np.array([[1, 2, 3],
             [4, 5, 6]])
a

array([[1, 2, 3],
       [4, 5, 6]])

In [3]:
# Get the number of dimensions
a.ndim

2

In [4]:
# Get the dimension sizes
a.shape

(2, 3)

In [5]:
# Get the number of elements
a.size

6

In [6]:
# Get the data type
a.dtype

dtype('int64')

In [7]:
# Construct an array from lists
b = (list(range(4)), list(range(5,9)), list(range(9,13)))
b

([0, 1, 2, 3], [5, 6, 7, 8], [9, 10, 11, 12])

In [8]:
# Concatenate two matrices along the row axis
c1 = np.concatenate((a, [[7,8,9]]), axis=0)
c1

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [9]:
# Stack two matrices vertically
c2 = np.vstack((a, [[7,8,9]]))
c2

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [10]:
# Concatenate two matrices along the column axis
c3 = np.concatenate((a, [[11],[14]]), axis=1)
c3

array([[ 1,  2,  3, 11],
       [ 4,  5,  6, 14]])

In [11]:
# Stack two matrices horizontally
c4 = np.hstack((a, [[11],[14]]))
c4

array([[ 1,  2,  3, 11],
       [ 4,  5,  6, 14]])

In [12]:
# Construct an array from explicit lists
d = np.array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])
d

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [13]:
d.shape

(3, 4)

In [14]:
d.ndim == len(d.shape)

True

In [15]:
d.size

12

In [16]:
d.size == math.prod(d.shape)

True

### Construct an array

In [17]:
# Construct an array of zeros
z = np.zeros(12)
z

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
z.reshape(4,3)

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [19]:
# Construct an array of ones
o = np.ones(9)
o

array([1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [20]:
# Construct an array of unstable garbage from the memory
e = np.empty(16)
e

array([4.67296746e-307, 1.69121096e-306, 1.69120688e-306, 8.34441742e-308,
       1.78022342e-306, 6.23058028e-307, 9.79107872e-307, 6.89807188e-307,
       7.56594375e-307, 6.23060065e-307, 1.78021527e-306, 8.34454050e-308,
       1.11261027e-306, 1.15706896e-306, 1.33512173e-306, 1.33504432e-306])

In [21]:
# Construct an array from a range
r = np.arange(20)
r

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [22]:
# Construct an array from intervals
l = np.linspace(0,10, num=25) # num=number of intervals
l

array([ 0.        ,  0.41666667,  0.83333333,  1.25      ,  1.66666667,
        2.08333333,  2.5       ,  2.91666667,  3.33333333,  3.75      ,
        4.16666667,  4.58333333,  5.        ,  5.41666667,  5.83333333,
        6.25      ,  6.66666667,  7.08333333,  7.5       ,  7.91666667,
        8.33333333,  8.75      ,  9.16666667,  9.58333333, 10.        ])

In [23]:
# Specify the data type
np.empty(2, dtype=np.float32) # Also float or "float32"

array([0.    , 2.5625], dtype=float32)

In [106]:
# Construct a 3D array
e = np.array([[[0, 1, 2, 3],
              [4, 5, 6, 7]],
             [[0, 1, 2, 3],
              [4, 5, 6, 7]],
             [[0 ,1 ,2, 3],
              [4, 5, 6, 7]]])
e.ndim

3

In [25]:
# Construct a deep copy
a_c = a.copy()
a_c

array([[1, 2, 3],
       [4, 5, 6]])

### Reshaping

In [26]:
# Construct a vector
f = np.arange(12)
f

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [108]:
# Add an axis/dimension
fr = f[np.newaxis, :]
fr

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]])

In [28]:
fr.shape

(1, 12)

In [29]:
# Add an axis/dimension
np.expand_dims(f, axis=0)

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]])

In [30]:
# Add an axis/dimension
fc = f[: ,np.newaxis]
fc

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11]])

In [31]:
fc.shape

(12, 1)

In [32]:
# Add an axis/dimension
np.expand_dims(f, axis=1)

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11]])

In [33]:
fc.reshape(3,4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

### Slicing and dicing

In [34]:
l

array([ 0.        ,  0.41666667,  0.83333333,  1.25      ,  1.66666667,
        2.08333333,  2.5       ,  2.91666667,  3.33333333,  3.75      ,
        4.16666667,  4.58333333,  5.        ,  5.41666667,  5.83333333,
        6.25      ,  6.66666667,  7.08333333,  7.5       ,  7.91666667,
        8.33333333,  8.75      ,  9.16666667,  9.58333333, 10.        ])

In [35]:
# Filter an array
l[l > 5]

array([ 5.41666667,  5.83333333,  6.25      ,  6.66666667,  7.08333333,
        7.5       ,  7.91666667,  8.33333333,  8.75      ,  9.16666667,
        9.58333333, 10.        ])

In [36]:
l2 = l.reshape(5,5)
l2

array([[ 0.        ,  0.41666667,  0.83333333,  1.25      ,  1.66666667],
       [ 2.08333333,  2.5       ,  2.91666667,  3.33333333,  3.75      ],
       [ 4.16666667,  4.58333333,  5.        ,  5.41666667,  5.83333333],
       [ 6.25      ,  6.66666667,  7.08333333,  7.5       ,  7.91666667],
       [ 8.33333333,  8.75      ,  9.16666667,  9.58333333, 10.        ]])

In [37]:
l2[l2 > 5]

array([ 5.41666667,  5.83333333,  6.25      ,  6.66666667,  7.08333333,
        7.5       ,  7.91666667,  8.33333333,  8.75      ,  9.16666667,
        9.58333333, 10.        ])

In [38]:
l2[:,1]

array([0.41666667, 2.5       , 4.58333333, 6.66666667, 8.75      ])

In [39]:
l[l % 2 == 0]

array([ 0., 10.])

In [40]:
np.nonzero(l % 2 == 0)

(array([ 0, 24]),)

In [41]:
np.nonzero(l2 % 2 == 0)

(array([0, 4]), array([0, 4]))

### Converting existing data

In [42]:
v = np.empty(4).reshape(2,2)
w = np.empty(4).reshape(2,2)

In [43]:
np.hstack((v,w))

array([[0.00000000e+000, 1.07140671e-311, 0.00000000e+000,
        1.07140671e-311],
       [1.07140671e-311, 1.07140671e-311, 1.07140671e-311,
        1.07140671e-311]])

In [44]:
np.vstack((v,w))

array([[0.00000000e+000, 1.07140671e-311],
       [1.07140671e-311, 1.07140671e-311],
       [0.00000000e+000, 1.07140671e-311],
       [1.07140671e-311, 1.07140671e-311]])

In [45]:
np.split(v, 2)

[array([[0.00000000e+000, 1.07140671e-311]]),
 array([[1.07140671e-311, 1.07140671e-311]])]

In [46]:
x = np.arange(1, 25).reshape(2, 12)
x

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]])

In [47]:
np.hsplit(x, (3, 4)) # Split before col 3 and col 4

[array([[ 1,  2,  3],
        [13, 14, 15]]),
 array([[ 4],
        [16]]),
 array([[ 5,  6,  7,  8,  9, 10, 11, 12],
        [17, 18, 19, 20, 21, 22, 23, 24]])]

### Array operations

In [48]:
rng = np.random.default_rng() # Create random generator

Z = rng.random((4, 4))
Z = Z * 100
Z

array([[ 6.56157652, 18.99075297, 46.68357442, 21.29901205],
       [66.08822638, 94.83932736, 98.05348734, 42.9251781 ],
       [37.21002838,  9.66552261, 29.01322221,  6.84436699],
       [34.66322633, 35.16377629, 63.54913412, 15.29835818]])

In [49]:
sum(Z) # Sum along axis=0

array([144.52305761, 158.65937924, 237.29941809,  86.36691532])

In [50]:
Z.sum() # Sum all elements

np.float64(626.8487702609757)

In [51]:
Z - Z.mean()

array([[-32.61647162, -20.18729517,   7.50552628, -17.87903609],
       [ 26.91017823,  55.66127922,  58.8754392 ,   3.74712996],
       [ -1.96801976, -29.51252553, -10.16482593, -32.33368115],
       [ -4.51482181,  -4.01427185,  24.37108598, -23.87968996]])

In [52]:
Z_dot = np.dot(Z, Z) # (AB)[i,j] = Σ A[i,k] * B[k,j]
Z_dot

array([[ 3773.50907235,  3125.85435612,  4876.40209353,  1600.29598214],
       [11837.90410123, 12706.71275568, 17957.26693638,  6806.40774987],
       [ 2199.76435876,  2144.41986528,  3961.55598683,  1510.71584045],
       [ 5446.3125559 ,  5145.37329711,  7882.09676468,  2916.69719428]])

In [53]:
Z_inner = np.inner(Z, Z) # inner(A, B)[i,j] = Σ A[i,k] * B[j,k]
Z_inner

array([[ 3036.70701993,  7726.46435194,  1927.93117395,  4187.77264845],
       [ 7726.46435194, 24819.20897558,  6514.45973058, 12513.63900725],
       [ 1927.93117395,  6514.45973058,  2366.62096163,  3578.16863733],
       [ 4187.77264845, 12513.63900725,  3578.16863733,  6710.56263298]])

In [54]:
np.dot(Z, Z.T) == Z_inner

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])

#### Linear regression

In [55]:
X = rng.random((10, 4))
X

array([[0.52740607, 0.22169566, 0.83663449, 0.71064961],
       [0.29672305, 0.04190438, 0.5714429 , 0.98370683],
       [0.48111786, 0.66416056, 0.67093231, 0.24097324],
       [0.44943225, 0.66951251, 0.16906302, 0.36772381],
       [0.69939845, 0.47813128, 0.05280468, 0.78289997],
       [0.31243492, 0.00411625, 0.79021543, 0.56844727],
       [0.54545347, 0.07281921, 0.36672492, 0.36404563],
       [0.8306393 , 0.8513401 , 0.81953197, 0.77307522],
       [0.92405514, 0.48815213, 0.93840818, 0.30013106],
       [0.808338  , 0.86581844, 0.68846893, 0.01033737]])

In [56]:
Xd = np.hstack((X, np.array([1] * X.shape[0]).reshape(X.shape[0],1)))

In [57]:
Gram = np.linalg.inv(
    np.dot(Xd.T, Xd)
)
Gram

array([[ 4.38952077, -1.84382443, -0.68742123,  0.08536729, -1.41305376],
       [-1.84382443,  2.04138278,  0.35130402,  0.59139184, -0.31546208],
       [-0.68742123,  0.35130402,  1.36342437,  0.14821598, -0.62984246],
       [ 0.08536729,  0.59139184,  0.14821598,  1.54626641, -1.18427482],
       [-1.41305376, -0.31546208, -0.62984246, -1.18427482,  2.04372533]])

In [58]:
Y = rng.random((10,1))
Y

array([[0.61738065],
       [0.24776985],
       [0.95646348],
       [0.02275315],
       [0.7090033 ],
       [0.73530624],
       [0.19012591],
       [0.27282907],
       [0.33553867],
       [0.38865767]])

In [59]:
beta = np.dot(
    Gram,
    np.dot(Xd.T, Y)
)
beta

array([[-0.33923015],
       [ 0.06811505],
       [ 0.24343327],
       [-0.02847351],
       [ 0.48799694]])

In [60]:
Y_hat = np.dot(Xd, beta)
Y_hat

array([[0.50761569],
       [0.50129248],
       [0.52649247],
       [0.41182502],
       [0.27387034],
       [0.55846901],
       [0.38683015],
       [0.44169729],
       [0.42767407],
       [0.44006148]])

In [61]:
e = Y - Y_hat
e

array([[ 0.10976497],
       [-0.25352263],
       [ 0.42997101],
       [-0.38907187],
       [ 0.43513297],
       [ 0.17683723],
       [-0.19670424],
       [-0.16886823],
       [-0.0921354 ],
       [-0.05140381]])

In [62]:
s = np.dot(e.T, e) / (Xd.shape[0] - Xd.shape[1])
s

array([[0.1423053]])

In [63]:
sI = s * np.diag([1] * 4)
sI

array([[0.1423053, 0.       , 0.       , 0.       ],
       [0.       , 0.1423053, 0.       , 0.       ],
       [0.       , 0.       , 0.1423053, 0.       ],
       [0.       , 0.       , 0.       , 0.1423053]])

In [64]:
sGram = s * Gram
sGram

array([[ 0.62465205, -0.26238598, -0.09782368,  0.01214822, -0.20108503],
       [-0.26238598,  0.29049958,  0.04999242,  0.08415819, -0.04489193],
       [-0.09782368,  0.04999242,  0.19402251,  0.02109192, -0.08962992],
       [ 0.01214822,  0.08415819,  0.02109192,  0.2200419 , -0.16852858],
       [-0.20108503, -0.04489193, -0.08962992, -0.16852858,  0.29083294]])

$$
SSE = \sum_{i=1}^N e_i^2
$$

In [65]:
SSE = (e ** 2).sum()
SSE

np.float64(0.711526482154541)

$$
SSR = \sum_{i=1}^N (\hat{Y}-\bar{Y})^2
$$

In [66]:
SSR = ((Y_hat - Y.mean()) ** 2).sum()
SSR

np.float64(0.06064425490747408)

$$
SST = \sum_{i=1}^N (Y-\bar{Y})^2
$$

In [67]:
SST = ((Y - Y.mean()) ** 2).sum()
SST == (SSR + SSE)

np.False_

In [68]:
np.isclose(SST, SSR + SSE)

np.True_

In [69]:
np.allclose(SST, SSR + SSE)

True

In [70]:
SST - (SSR + SSE)

np.float64(3.3306690738754696e-16)

In [71]:
np.dot(e.T, Y_hat)

array([[1.71553062e-15]])

$$
R^2=1-\frac{SSE}{SST}
$$

In [72]:
r_squared = 1 - (SSE / SST)
r_squared

np.float64(0.07853736485562246)

$$
Adj R^2=1-\frac{(1-R^2)(N-1)}{(N-p-1)}
$$

In [73]:
adjusted_rs = 1 - (((1 - r_squared) * (Xd.shape[0] - 1)) / (Xd.shape[0] - Xd.shape[1] - 1))
adjusted_rs

np.float64(-1.0732909290748496)

In [74]:
a = np.array([1, 2, 3])
b = np.array([10, 20])

### Broadcasting

| operation | code | broadcasting operation | maths equivalent |
|-----------|------|------------------------|------------------|
| $\mathbb{R}^{(1 \times 1)} * \mathbb{R}^{(1 \times n)}$ | `c * b` | stretch | scaling |
| $\mathbb{R}^{(1 \times n)} \times \mathbb{R}^{(1 \times n)}$ | `a * b` | | element-wise operation |

- two dimensions are compatible if they have an equal number of elements or either has a single element
- a dimension with no element has one by assumption
- compatibility is compared from the trailing dimensions leftwards
- broadcast operates either by scale ($c \times b_i$) or element-wise ($a_i \times b_i$)

#### Kronecker product

$$
A \otimes B
$$

In [75]:
kronecker = np.kron(a, b)
kronecker

array([10, 20, 20, 40, 30, 60])

In [76]:
a[:, None]

array([[1],
       [2],
       [3]])

In [77]:
a.reshape(3,1)

array([[1],
       [2],
       [3]])

In [78]:
broadcast = (a.reshape(3,1) * b)
broadcast

array([[10, 20],
       [20, 40],
       [30, 60]])

In [79]:
broadcast.ravel()

array([10, 20, 20, 40, 30, 60])

### Aggregate operations

In [80]:
v = np.linspace(1,140,20).round(3)
v = v.reshape(5,4)
v

array([[  1.   ,   8.316,  15.632,  22.947],
       [ 30.263,  37.579,  44.895,  52.211],
       [ 59.526,  66.842,  74.158,  81.474],
       [ 88.789,  96.105, 103.421, 110.737],
       [118.053, 125.368, 132.684, 140.   ]])

In [81]:
rng = np.random.default_rng()

w = rng.random(20).round(5) * 100
w

array([94.363, 92.106, 66.014, 61.706, 50.715, 78.801, 72.686, 70.359,
       54.401, 72.084, 71.258, 30.873,  4.708, 30.03 ,  4.714, 46.858,
       22.657, 99.222,  1.547, 20.949])

In [82]:
v.mean() == np.mean(v)

np.True_

In [83]:
v.std()

np.float64(42.184839320542636)

In [84]:
np.median(w)

np.float64(58.0535)

In [85]:
(w.min(), w.max())

(np.float64(1.547), np.float64(99.222))

In [86]:
v.sum()

np.float64(1410.0)

In [87]:
w.prod()

np.float64(1.4202965077957146e+31)

In [88]:
w = w.reshape(5,4)

w.std(axis=0)

array([30.59320395, 24.17871737, 32.84292669, 18.42304647])

### Matrices

In [89]:
d = 5

In [90]:
m = np.array([[random.randint(0,101) for j in range(d)] for i in range(d)]).reshape(d,d)
m

array([[51, 74, 83, 15, 63],
       [35, 82, 63, 26, 67],
       [59, 62, 96, 97, 45],
       [56, 56, 74, 10, 93],
       [ 6,  8, 17, 71, 78]])

In [91]:
# Generate a random array
# endpoint=True makes the upper bound inclusive
n = rng.integers(d, size=(d, d), endpoint=True)
n

array([[2, 0, 5, 4, 1],
       [2, 2, 4, 3, 3],
       [0, 3, 5, 4, 2],
       [5, 2, 4, 2, 2],
       [1, 4, 3, 1, 1]])

In [92]:
# Reduce the array to a set
np.unique(n)

array([0, 1, 2, 3, 4, 5])

In [93]:
# Get also the vector of indices
np.unique(n, return_index=True)

(array([0, 1, 2, 3, 4, 5]), array([1, 4, 0, 8, 3, 2]))

In [94]:
# Get also the vector of frequencies
np.unique(n, return_counts=True)

(array([0, 1, 2, 3, 4, 5]), array([2, 4, 7, 4, 5, 3]))

In [95]:
# np.unique() is shape insensitive
np.unique(n.reshape(n.size,), return_counts=True)

(array([0, 1, 2, 3, 4, 5]), array([2, 4, 7, 4, 5, 3]))

In [96]:
unique1, counts1 = np.unique(n[0], return_counts=True) 
unique2, counts2 = np.unique(n[0,:], return_counts=True)

counts1 == counts2

array([ True,  True,  True,  True,  True])

In [97]:
# Unique vals in the first col
np.unique(n[:,0], return_counts=True)

(array([0, 1, 2, 5]), array([1, 1, 2, 1]))

In [104]:
# Summary statistics
n.max(axis=0), n.max(axis=1), n.max()

(array([5, 4, 5, 4, 3]), array([5, 4, 5, 5, 4]), np.int64(5))

In [105]:
n.min(), n.sum()

(np.int64(0), np.int64(65))

In [117]:
# Construct a matrix
V = np.arange(1,7).reshape(3,2)
V

array([[1, 2],
       [3, 4],
       [5, 6]])

In [118]:
W = np.arange(11,17).reshape(2,3)
W

array([[11, 12, 13],
       [14, 15, 16]])

In [122]:
# Multiply the two matrices
A = V @ W
A

array([[ 39,  42,  45],
       [ 89,  96, 103],
       [139, 150, 161]])

In [124]:
# Construct and invert a 3x3 matrix
B = np.linalg.inv(
    rng.integers(A.size, size=A.shape, endpoint=True)
)
B

array([[ 0.15384615, -0.23076923,  0.23076923],
       [ 0.07692308,  0.38461538, -0.38461538],
       [-0.07692308, -0.05128205,  0.21794872]])

In [126]:
# Postmultiply by the transpose
A @ B.transpose() == A @ B.T

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [130]:
# A'A is symmetric and A'A=(A'A)'
A.T @ A == (A.T @ A).T

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [134]:
# Construct an idempotent matrix
C = np.array(
    [[4,-1],
    [12,-3]]
)
C

array([[ 4, -1],
       [12, -3]])

In [136]:
# Test whether C^2=C
C @ C == C

array([[ True,  True],
       [ True,  True]])

In [137]:
# Reverse an array along both axes
np.flip(C)

array([[-3, 12],
       [-1,  4]])

In [138]:
# Reverse an array along a specific axis
np.flip(C, axis=0)

array([[12, -3],
       [ 4, -1]])

In [140]:
# Reverse a row or col
C[:,0] = np.flip(C[:,0])
C

array([[12, -1],
       [ 4, -3]])

In [141]:
# Flatten an array in a copy
B.flatten()

array([ 0.15384615, -0.23076923,  0.23076923,  0.07692308,  0.38461538,
       -0.38461538, -0.07692308, -0.05128205,  0.21794872])

In [143]:
# Flatten an array in a view object
B.ravel()

array([ 0.15384615, -0.23076923,  0.23076923,  0.07692308,  0.38461538,
       -0.38461538, -0.07692308, -0.05128205,  0.21794872])

### Vectorised mean squared error

$$
MSE=\frac{1}{n}\sum_{i=1}^n \left(Y_i-\hat{Y}_i\right)^2
$$

In [149]:
# Get the length of the vector
n = Y.shape[0]
n

10

In [151]:
# Use the values from the linear regression above
(1 / n) * np.sum(np.square(Y - Y_hat))

np.float64(0.0711526482154541)