In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [4]:
X, y = mnist["data"], mnist["target"]

In [5]:
y = y.astype(np.uint8)

In [6]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [7]:
enc.fit(y[:,np.newaxis])

  enc.fit(y[:,np.newaxis])


OneHotEncoder()

In [8]:
Y = enc.transform(y[:,np.newaxis]).toarray()

  Y = enc.transform(y[:,np.newaxis]).toarray()


In [9]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [10]:
X_train = X_train / 255
X_test = X_test / 255

In [11]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

### Softmax 공식

$$ 
diag(A_{exp} \mathbb{1}_k)^{-1} A_{exp} 
= \begin{bmatrix}
{1 \over \sum_{j}exp(a_{1j})} & \cdots & 0 \\
\vdots & \ddots & \vdots \\
0 & \cdots & {1 \over \sum_{j}exp(a_{Nj})}
\end{bmatrix} 
\begin{bmatrix}
- \ {exp(a_1^T)} \ - \\ \vdots
\end{bmatrix}
= Y
$$

In [12]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

### E(w) 공식

$$ E(w) = {1 \over N} \sum_{n=1}^{N} {E_n{w}} = -{1 \over N} \sum_{n=1}{N} \sum_{k=1}^{K} t_{nk} \ln{y_{nk}} = -{1 \over N} \{ \mathbb{1}_N^T (\ln(Y) \circ T) \mathbb{1}_k \} $$

In [13]:
def compute_cost(X, T, W, lambda_=0):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    
    reg_l2 = (lambda_ / 2) *  np.sum(np.dot(W.T, W))
    
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + reg_l2
    
    return cost

In [14]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

### W의 Update 공식

$$ W = W - \{learning\_rate * {{1 \over N} \Phi^T (Y - T)}\} $$

In [15]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size, val_ratio, lambda_):
    N = len(T)
    cost_history = np.zeros((iterations+1,1))
    shuffled_indices = np.random.permutation(N)
    X_total    = X[shuffled_indices]
    T_total    = T[shuffled_indices]
    
    val_N      = int(N * (1 - val_ratio))
    X_shuffled = X_total[:val_N]
    T_shuffled = T_total[:val_N]
    X_eval     = X_total[val_N:]
    T_eval     = T_total[val_N:]

    for i in range(iterations+1):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
            
        ## Update
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W, lambda_)
        if i % 10000 == 0:
            print("ep :", i, cost_history[i][0], compute_cost(X_eval, T_eval, W, lambda_))

    return (cost_history, W)

In [16]:
lambdas = [0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.5, 0.8, 1]

for l in lambdas : #{
    X = np.hstack((np.ones((np.size(X_train, 0),1)), X_train))
    T = y_train

    K = np.size(T, 1)
    M = np.size(X, 1)
    W = np.zeros((M,K))

    iterations    = 50000
    learning_rate = 0.01
    val_ratio     = 0.2
    lambda_       = 0

    initial_cost  = compute_cost(X, T, W)

    print(f'\n>> lambda : {l}')
    print("Initial Cost is: {}".format(initial_cost[0][0]))

    (cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, val_ratio, lambda_)
#}


>> lambda : 0
Initial Cost is: 2.3024850979937352
ep : 0 2.2819687302428764 [[2.29253048]]
ep : 10000 0.3008118929597312 [[0.38656275]]
ep : 20000 0.4447022748071709 [[0.3511305]]
ep : 30000 0.20286393122143603 [[0.33815048]]
ep : 40000 0.43355588100228193 [[0.3281135]]
ep : 50000 0.032138960771842764 [[0.42100292]]

>> lambda : 1e-06
Initial Cost is: 2.3024850979937352
ep : 0 2.2815077077996357 [[2.29118186]]
ep : 10000 0.18662060560379562 [[0.37738019]]
ep : 20000 0.36443268697637227 [[0.34221517]]
ep : 30000 0.25436704765714513 [[0.32104692]]
ep : 40000 0.3129903183990167 [[0.31250543]]
ep : 50000 0.029317246023465634 [[0.38386871]]

>> lambda : 1e-05
Initial Cost is: 2.3024850979937352
ep : 0 2.2696098602217756 [[2.29243081]]
ep : 10000 0.6203805344682827 [[0.37195958]]
ep : 20000 0.21336199165159864 [[0.33822009]]
ep : 30000 0.4060430679151972 [[0.31366937]]
ep : 40000 0.2502876041782785 [[0.31620549]]
ep : 50000 0.02542215244103693 [[0.34896378]]

>> lambda : 0.0001
Initial Cost

In [17]:
X = np.hstack((np.ones((np.size(X_train, 0),1)), X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations    = 50000
learning_rate = 0.01
val_ratio     = 0.2
lambda_       = 1e-5

initial_cost  = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, val_ratio, lambda_)

Initial Cost is: 2.3024850979937352 

ep : 0 2.281500293625614 [[2.29271853]]
ep : 10000 0.3333691120750739 [[0.39291427]]
ep : 20000 0.30398021754341126 [[0.33905457]]
ep : 30000 0.4718158897978205 [[0.31760922]]
ep : 40000 0.2872860418810027 [[0.31942722]]
ep : 50000 0.023277707670544018 [[0.35606077]]


In [18]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print(score)

0.9
