# numpy vector tooling in ML preprocessing

NumPy slices are views into the same underlying data
Python list slices are new lists (shallow copies that hold the same references)

this is important because we slice and preprocess constantly in ML!

In [None]:
import numpy as np

# A small float32 vector to index and slice.
x = np.array([0.2, -1.5, 3.0, 0.0, 2.2], dtype=np.float32)
print(x)
print(x[0], x[-1]) # integer indexing returns NumPy scalars (np.float32 here), not Python floats
print(x[1:4]) # a slice returns a NumPy array, here with shape (3,)

y = x[1:4] # basic slicing returns a view - writing into y = writing into x
y[0] = 999 # y[0] is the same memory as x[1], so x changes as well
print("y:", y)
print("x", x)

[ 0.2 -1.5  3.   0.   2.2]
0.2 2.2
[-1.5  3.   0. ]
y: [999.   3.   0.]
x [2.00e-01 9.99e+02 3.00e+00 0.00e+00 2.20e+00]


In [3]:
# Check the aliasing directly.
# NOTE: x and y are not defined in this cell; presumably they are the array and its slice from the view example.
print("shares memory?", np.shares_memory(x, y)) # True when the two arrays overlap in memory
print("y.base is x?", y.base is x) # a view's .base is the array whose memory it borrows

shares memory? True
y.base is x? True


# basic slicing shares memory, which is powerful and dangerous
when we need independent data, we have the option of working on copies:

In [4]:
# Start again from a fresh array so no earlier in-place write carries over.
x = np.array([0.2, -1.5, 3.0, 0.0, 2.2], dtype=np.float32)

# .copy() gives y its own buffer, so the write below cannot reach x.
y = x[1:4].copy()
y[0] = 999

aliased = np.shares_memory(x, y)
print("y:", y)
print("x:", x)
print("shares memory?", aliased)

y: [999.   3.   0.]
x: [ 0.2 -1.5  3.   0.   2.2]
shares memory? False


In [6]:
# boolean masks: comparing an array to a value yields one True/False per element
x = np.array([0.2, -1.5, 3.0, 0.0, 2.2], dtype=np.float32)

# strictly greater than zero, so both the negative entry and the 0.0 are excluded
mask = x > 0
print(mask)

# indexing with the mask keeps only the elements where the mask is True
kept = x[mask]
print(kept)

[ True False  True False  True]
[0.2 3.  2.2]


In [None]:
# masking (boolean indexing) returns a copy!
# slicing gives a view

# NOTE: x is not redefined here; presumably it is still the array from the boolean-mask cell.
pos = x[x > 0]
pos[0] = 999 # only the copy changes; x keeps its original values

print("pos:", pos)
print("x:", x)
print("shares memory?", np.shares_memory(pos, x))

pos: [999.    3.    2.2]
x: [ 0.2 -1.5  3.   0.   2.2]
shares memory? False


In [None]:
# numpy methods

# mean - the average value; kept as mu so the same centering can be applied again later
x = np.array([1, 4, 1, 3.0, 3.2, 10], dtype=np.float32)
mu = x.mean()
print(mu)

3.7


In [None]:
# std gives standard deviation (how spread out values are)
# ML models behave better when features are scaled similarly
# lower is not necessarily better - you may want low or high spread

# standardization in preprocessing puts features on a common, unitless scale

# NOTE: x and mu are not defined in this cell; presumably they come from the mean cell.
sigma = x.std() # no ddof argument, so this is the population std (divides by N, not N - 1)
print(sigma)

# z standardization
# z scores: how many standard deviations each value lies from the mean
# NOTE: this divides by sigma, so a constant vector (sigma == 0) would not give finite z-scores
z = (x - mu) / sigma
print(z)

3.0304015
[-0.89097106  0.09899677 -0.89097106 -0.23099251 -0.16499464  2.0789325 ]


In [None]:
# sanity check: standardized values should have mean ~0 and std ~1 (up to floating-point rounding)
# NOTE: z is not defined in this cell; presumably it is the z-score vector computed just before.
print(z.mean())
print(z.std())

0.0
1.0


# standardization
x = training vector
mu, sigma = computed from x via x.mean(), x.std()

x_new = different vector (future input, new test sample, etc)
# x_new must be transformed using the same mu and sigma!
standardization is a coordinate change:
models learn patterns in "z-space".
new inputs must be standardized with the same mean / std; otherwise
the model is fed a different coordinate system. it would be like
training with meters and testing with inches.

z_new = (x_new - mu) / sigma

In [None]:
# argmax - the position of the biggest value, not the value itself
scores = np.array([0.1, -2.0, 3.4, 3.3], dtype=np.float32)
idx = scores.argmax()  # positions are 0-based
best = scores[idx]     # winning score; how strongly the model scores that class relative to the others

print(scores)
print(idx)
print(best)

[ 0.1 -2.   3.4  3.3]
2
3.4


In [None]:
scores2 = np.array([1.0, 5.0, 5.0, 2.0], dtype=np.float32)
tie_idx = scores2.argmax()  # on ties, argmax reports the first occurrence of the maximum
print(tie_idx)

1
