# Softmax with Numpy

In [14]:
import sklearn
import numpy as np

In [15]:
from sklearn.datasets import load_iris
datasets = load_iris()

In [16]:
x_data = datasets["data"]
x_data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [17]:
y_data = datasets["target"]
y_data

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [18]:
y_data = y_data.reshape([-1,1])
y_data[:3]

array([[0],
       [0],
       [0]])

In [19]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(y_data)
y_data = enc.transform(y_data).toarray()
y_data[:3]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [20]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
x_data_minmax = min_max_scaler.fit_transform(x_data)
x_data_minmax[:3]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667]])

In [21]:
x_0 = np.ones(x_data_minmax.shape[0])
x_data_minmax = np.column_stack((x_0, x_data_minmax))

x_data_minmax[:3]

array([[1.        , 0.22222222, 0.625     , 0.06779661, 0.04166667],
       [1.        , 0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [1.        , 0.11111111, 0.5       , 0.05084746, 0.04166667]])

In [22]:
weights = np.random.uniform(size = (3, 5))
weights

array([[0.94414395, 0.30207917, 0.89048229, 0.64338334, 0.41383933],
       [0.14859301, 0.62325635, 0.16024632, 0.119151  , 0.5300794 ],
       [0.08860008, 0.82127508, 0.69798108, 0.74528909, 0.12224229]])

In [23]:
def softmax(z):
  e = np.exp(z)
  p = e / np.sum(np.exp(z), axis = 1).reshape([-1, 1])
  return p

In [26]:
z = x_data_minmax.dot(weights.T)
z[:3], z.shape

(array([[1.6286866 , 0.41741305, 0.76296533],
        [1.42638728, 0.34940304, 0.57192621],
        [1.47290716, 0.32611204, 0.57183289]]),
 (150, 3))

In [25]:
softmax(z[:2])

array([[0.58188057, 0.17329436, 0.24482507],
       [0.56620856, 0.19286265, 0.24092879]])

In [27]:
def cross_entropy_function(y, x, weights):
  z = x_data_minmax.dot(weights.T)
  result = -np.sum(
             np.sum(
              (y * np.log(softmax(z))), axis = 1).reshape((-1,1))
             )
  return result # 상수

In [28]:
from audioop import cross


def minimize_gradient(y, x, initial_weights, iterations = 500000, alpha = 0.001):
  cost_history = []
  theta_history = []
  m = y.shape[0]
  theta = np.copy(initial_weights)

  number_of_classes = theta.shape[0]
  number_of_weights = theta.shape[1]

  for _ in range(iterations):
    original_theta = np.copy(theta)
    for k in range(number_of_classes):
      for j in range(number_of_weights):
        partial_x = x[:, j]
        partial_entropy = y - softmax(x.dot(original_theta.T))
        theta[k][j] = original_theta[k][j] + (alpha * partial_entropy[:, k].dot(partial_x.T)) / 150
    
    if (_ % 10000) == 0:
      print(cross_entropy_function(y, x, theta) / 150)
      cost_history.append(cross_entropy_function(y, x, theta))
  
  return theta, cost_history

In [29]:
theta, cost_history = minimize_gradient(y_data, x_data_minmax, weights)

1.1894210677880528
0.7172846225460702
0.596598220225385
0.5288309340388052
0.48390221002720374
0.45097370781978935
0.42524683341127
0.404256840124628
0.3865941502905619
0.37138569324128334
0.3580567812269285
0.3462107003616247
0.33556296546415026
0.3259031219731775
0.31707139243037036
0.3089437808727984
0.30142222737353397
0.2944279003142669
0.28789650572397485
0.28177493199297127
0.2760188016100256
0.27059065299892926
0.2654585688870339
0.26059512678988433
0.2559765855995676
0.2515822477466811
0.24739395364882577
0.24339567703465315
0.2395731980446635
0.23591383691360052
0.232406235289005
0.229040175337735
0.2258064290754867
0.2226966320554574
0.21970317683231444
0.21681912258963715
0.2140381180636634
0.2113543354711537
0.20876241359671055
0.2062574085456441
0.20383475094528333
0.20149020859756797
0.19921985376147744
0.1970200343851565
0.1948873487218472
0.19281862285653395
0.1908108907460882
0.1888613764379136
0.18696747818345708
0.18512675420544944


In [30]:
rand_index = np.random.randint(0, 150, 30)

In [31]:
y_pred = np.argmax(softmax(x_data_minmax[rand_index].dot(theta.T)), axis = 1)
y_pred

array([0, 1, 1, 0, 2, 1, 2, 0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 1, 1, 1, 2, 2,
       2, 1, 2, 0, 0, 2, 2, 2], dtype=int64)

In [32]:
y_true = np.argmax(y_data[rand_index], axis = 1)
y_true

array([0, 1, 1, 0, 2, 1, 2, 0, 1, 1, 1, 1, 0, 1, 0, 0, 2, 2, 1, 1, 2, 2,
       2, 1, 2, 0, 0, 2, 2, 2], dtype=int64)

In [33]:
sum(y_pred == y_true) / len(rand_index) # Accuracy

0.9666666666666667

---

# Metrics for Multiclass

- Accuracy : 전체 class 중 정확히 일치한 class의 개수
- Precision : TP / (TP + FP) (= 실제P / 예측P)
  - ex) Precision A = A라고 예측한 것들 중 실제 A인 것의 비율
  - macro 방식 : 클래스별로 Precision을 구해서 평균을 낸 것.
  - micro 방식 : 맞춘 것과 못 맞춘 것으로 나누어 전체 클래스에서 Precision을 구하는 것.
- Recall : TP / (TP + FN) (= 예측P / 실제P)
  - ex) Recall A = 실제 A인 것들 중 A라고 예측한 것의 비율
  - macro, micro 방식 계산은 Precision과 동일