In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns
from sklearn.model_selection import train_test_split
import random

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

  enc.fit(y[:,np.newaxis])


OneHotEncoder()

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

  Y = enc.transform(y[:,np.newaxis]).toarray()


In [8]:
# X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

# train, test, valid split
# train 60%, test 20%, valid 20%
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5)

X_train, X_valid, X_test, y_train, y_valid, y_test = X[:42000], X[42000:56000], X[56000:], Y[:42000], Y[42000:56000], Y[56000:]


In [9]:
X_train = X_train / 255
X_test = X_test / 255
X_valid = X_valid /255 # validation set

In [10]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [11]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [12]:
# L2 regularization
def compute_cost(X, T, W, lambd):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1) 
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + (np.linalg.norm(W, ord=2)**2)*(lambd)/2
    return cost

In [13]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [14]:
# gradient 계산에 비용함수 반영
def batch_gd(X, T, W, learning_rate, iterations, batch_size, lambd):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch) + W*lambd)
#         cost_history[i] = compute_cost(X_batch, T_batch, W, lambd)
        if i % 1000 == 0:
            cost_history = compute_cost(X_batch, T_batch, W, lambd)
            print(cost_history)

    return (cost_history, W)

In [15]:
# Regularization을 위한 가중치 lambda를 튜닝
best_lambd = 0
best_score = 0
X = np.hstack((np.ones((np.size(X_valid, 0),1)),X_valid))
T = y_valid

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

for lambd in range(10):
    lambd = random.uniform(0,0.01)
    print("lambda: ", lambd)
    initial_cost = compute_cost(X, T, W, lambd)
    (cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, lambd)
    
    ## Accuracy
    X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
    T_ = y_test
    y_pred = predict(X_, W_optimal)
    score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))
    
    print("score: ", score)
    
    if score > best_score:
        best_score = score
        best_lambd = lambd
print("best score: ", best_score)
print("best lambda: ", best_lambd)


lambda:  0.00969405027712516
[[2.28089709]]
[[0.64343334]]
[[0.55232091]]
[[0.42151459]]
[[0.45281326]]
[[0.30402932]]
[[0.43888521]]
[[0.44042627]]
[[0.41963401]]
[[0.37950515]]
[[0.29549396]]
[[0.33599236]]
[[0.34745738]]
[[0.38134414]]
[[0.43862299]]
[[0.44772082]]
[[0.42055763]]
[[0.37187197]]
[[0.40307998]]
[[0.27260287]]
[[0.42571159]]
[[0.43534326]]
[[0.40387635]]
[[0.37339211]]
[[0.28664373]]
[[0.30925039]]
[[0.35719848]]
[[0.3902503]]
[[0.43025937]]
[[0.44475086]]
[[0.40246873]]
[[0.38694756]]
[[0.41061186]]
[[0.28443305]]
[[0.43273766]]
[[0.44704317]]
[[0.41131619]]
[[0.38447308]]
[[0.29629826]]
[[0.30509371]]
[[0.3674077]]
[[0.40285636]]
[[0.43405104]]
[[0.44829322]]
[[0.39998864]]
[[0.40247323]]
[[0.42151576]]
[[0.29802961]]
[[0.4404954]]
[[0.4589101]]
score:  0.9151428571428571
lambda:  0.004591980349869061
[[2.2844041]]
[[0.50998123]]
[[0.67913675]]
[[0.4317515]]
[[0.3675784]]
[[0.40180991]]
[[0.32177479]]
[[0.45077918]]
[[0.56383874]]
[[0.37809819]]
[[0.39765134]]
[[0.25