# softmax - turns a vector of scores into a probability distribution

every output is in (0, 1) - strictly positive
all outputs sum to 1

$$\mathrm{softmax}(s)_i = \frac{e^{s_i}}{\sum_j e^{s_j}}$$

In [4]:
import numpy as np
# Raw class scores (logits), stored as float32.
scores = np.array([0.1, -2.0, 3.4, 3.3], dtype=np.float32)

# Softmax straight from the definition: exponentiate every score,
# then divide each exponential by the total of all of them.
exps = np.exp(scores)
probs = exps / np.sum(exps)

print(probs)
print(probs.sum())  # sanity check: the probabilities add up to 1

[0.01895101 0.00232067 0.513812   0.4649163 ]
1.0


softmax = normalizes scores into a probability distribution (sum = 1)
exponentials preserve order and amplify gaps
normalization lets us interpret these as probabilities

SCORES ARE NOT Z-SCORES
scores = model outputs (logits)

# numerical stability:
exponentials can overflow if scores are large

In [5]:
# Deliberately huge scores: exp() of each one is too large for float32,
# so every entry prints as inf (NumPy also emits an overflow warning).
big = np.asarray([1000.0, 1001.0, 1002.0], dtype=np.float32)
print(np.exp(big))

[inf inf inf]


  print(np.exp(big))


# stability trick: 
subtract the max score before exponentiating - doesn't change the softmax result

In [None]:
# Shift every score down by the largest one. The biggest shifted value is
# then exactly 0, so exp() can no longer overflow.
shifted = big - np.max(big)

exps = np.exp(shifted)       # element-wise exponential of the shifted scores
probs = exps / np.sum(exps)  # softmax: normalise so the entries sum to 1

# The same constant was subtracted from every score, so the common factor
# e^(-max) shows up in both numerator and denominator and cancels out:
# these are the same probabilities the unshifted softmax defines.
print(probs)

[0.09003057 0.24472848 0.66524094]


# cross entropy loss

1. what is a true class index?
- classification involves fixed set of classes like
    a. class 0: cat
    b. class 1: dog
    c. class 2: fox
    d. class 3: bear
- models output one score per class
- the correct answer for training = true class index: label represented by an integer
pointing to the correct position in the score vector

2. softmax turns these raw scores into probabilities summing to 1
- tells us how confident a model is in each class

3. log usage
- we want a loss that gets smaller as the model assigns higher probabilities to the correct class
- gets bigger when the correct class has low probability
- e.g.: p_y = probability the model assigns to the correct class
    - p_y = 1, perfect
    - p_y = 0.01, really bad
- loss = -log(p_y)
    - log(1) = 0; perfect prediction gives loss 0
    - log(0.01) ≈ -4.6 (big negative), so -log(0.01) ≈ 4.6; big penalty loss

-log(p_y) is differentiable, so it can be used later with calculus for gradient-based learning

# loss = single number measuring how wrong the model is on an example
training is just making loss smaller

In [7]:
import numpy as np
# Logits for one example (one score per class) and the index of its true class.
scores = np.array([0.1, -2.0, 3.4, 3.3], dtype=np.float32)
y = 2

# Numerically stable softmax: shift so the largest logit is 0, then normalise.
shifted = scores - scores.max()
exps = np.exp(shifted)     # exponentiate once and reuse for numerator and denominator
probs = exps / exps.sum()  # NumPy reduction instead of Python's builtin sum() over the array
print(probs)
print(probs.sum())



[0.01895101 0.00232067 0.513812   0.4649163 ]
1.0


In [None]:
# Probability the model assigned to the true class (position y in probs).
p_correct = probs[y]
# Cross-entropy for this single example: the negative log of that probability.
# The value differs from example to example.
loss = np.negative(np.log(p_correct))
print(p_correct, loss)

0.513812 0.66589785


# loss is like damage points
every training example hits model w/ some damage based on how "wrong" it was

p_correct is a probability post softmax
the loss only ever takes a probability (a value between 0 and 1) as its input
-log(p_correct) converts the probability into an inversely related penalty (one input, model outputs probabilities for each class, p_correct = the probability at the correct class's index in this probability vector)

In [None]:
# Multiply every logit by 10, then redo the max-shifted (stable) softmax.
# NOTE: this rebinds `probs` and `p_correct`, which earlier cells also assign.
scores_big = scores * 10
shifted_big = scores_big - scores_big.max()
exps_big = np.exp(shifted_big)     # exponentiate once and reuse for numerator and denominator
probs = exps_big / exps_big.sum()  # NumPy reduction instead of Python's builtin sum()
p_correct = probs[y]               # index with the true-class label y, not a literal 2
print(p_correct) # scaling logits up made softmax more confident

# temperature in disguise: bigger scale (lower temperature) makes 
# probabilities more peaked, smaller scale makes them flatter

0.7310586
