In [92]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [93]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [94]:
X, y = mnist["data"], mnist["target"]

In [95]:
y = y.astype(np.uint8)

In [96]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [97]:
enc.fit(y[:,np.newaxis])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [98]:
Y = enc.transform(y[:,np.newaxis]).toarray()

In [99]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [100]:
X_train = X_train / 255
X_test = X_test / 255

In [101]:
X_val = X_train[50000:]
y_val = y_train[50000:]
X_train = X_train[:50000]
y_train = y_train[:50000]

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(50000, 784) (50000, 10) (10000, 784) (10000, 10)


In [102]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [103]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

### L2 정규화
아래의 구현에서 처럼 기존에 구해놓은 cost값에서 regularization을 위한 값을 빼주면 새로운 코스트를 구할 수 있다.

In [104]:
def compute_cost(X, T, W, lambda_):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1))
    cost -= (lambda_ / (N * 2.)) * np.sum(np.square(W))
    return cost

In [105]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [111]:
def batch_gd(X, T, W, learning_rate, iterations, lambda_ ,batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W, lambda_)
        #if i % 1000 == 0:
        #    print(cost_history[i][0])

    return (cost_history, W)

### Lambda구하기
입력 데이터를 10000장으로 줄인 X_val을 입력값으로 하고 iteration을 10000으로 조정하여 lambda값을 구한다. lambda의 값을 0.005씩 늘려주면서 가장 높은 점수를 기록하는 lambda값을 찾고 이 값을 통해 모델을 트레이닝 시킨후 스코어를 구하면, 기존의 값보다 높아졌음을 알 수 있다.

In [122]:
X = np.hstack((np.ones((np.size(X_val, 0),1)),X_val))
T = y_val
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 10000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W, 0)

lambda_ = 0
max_score = 0
max_lambda = 0
while(lambda_ <= 1):
    (cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, lambda_, 64)
    y_pred = predict(X_, W_optimal)
    score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))
    print("score: {}, lambda:{}".format(score, lambda_))
    if max_score < score:
        max_score = score
        max_lambda = lambda_
    lambda_ += 0.005
print("max score: {}, lambda: {}".format(max_score, max_lambda))

score: 0.8967, lambda:0
score: 0.8944, lambda:0.005
score: 0.8935, lambda:0.01
score: 0.8807, lambda:0.015
score: 0.8954, lambda:0.02
score: 0.8926, lambda:0.025
score: 0.892, lambda:0.030000000000000002
score: 0.8965, lambda:0.035
score: 0.891, lambda:0.04
score: 0.8964, lambda:0.045
score: 0.8932, lambda:0.049999999999999996
score: 0.8949, lambda:0.05499999999999999
score: 0.888, lambda:0.05999999999999999
score: 0.8974, lambda:0.06499999999999999
score: 0.8937, lambda:0.06999999999999999
score: 0.8956, lambda:0.075
score: 0.8934, lambda:0.08
score: 0.8842, lambda:0.085
score: 0.8973, lambda:0.09000000000000001
score: 0.9, lambda:0.09500000000000001
score: 0.8953, lambda:0.10000000000000002
score: 0.8935, lambda:0.10500000000000002
score: 0.8923, lambda:0.11000000000000003
score: 0.8957, lambda:0.11500000000000003
score: 0.8953, lambda:0.12000000000000004
score: 0.8945, lambda:0.12500000000000003
score: 0.8964, lambda:0.13000000000000003
score: 0.8898, lambda:0.13500000000000004
scor

In [123]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W, max_lambda)

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, max_lambda,64)

In [124]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print(score)

0.91
