In [83]:
from sklearn import datasets
import numpy as np
# Load the iris dataset object; later cells read its 'data' and 'target' entries.
iris = datasets.load_iris()

In [84]:
# Keep only feature columns 2 and 3 (presumably petal length and width --
# confirm against the dataset's feature_names) and the integer class labels.
feature_columns = [2, 3]
X = iris["data"][:, feature_columns]
y = iris["target"]

In [85]:
# Inspect which fields the loaded dataset object exposes.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [86]:
# Append a constant column of ones as the LAST column so the model can learn
# an intercept (the bias weights end up in the final row of Theta).
bias_column = np.ones((len(X), 1))
X_with_bias = np.hstack([X, bias_column])

In [87]:
# Seed NumPy's global RNG so the shuffled split and the random Theta
# initialisation in later cells are repeatable.
np.random.seed(2042)

In [88]:
def train_val_test_split(X, y, val_ratio=0.2, test_ratio=0.2):
    """Shuffle ``X`` and ``y`` together and cut them into train/validation/test parts.

    Rows are drawn from the ``X`` argument itself, so pass exactly the matrix
    you want split (for example, one that already contains a bias column).

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
        Feature rows.
    y : array-like, shape (n_samples, ...)
        Targets aligned with ``X`` row by row.
    val_ratio, test_ratio : float
        Fractions of the samples assigned to the validation and test parts.
        Sizes are truncated with ``int``; the training part receives the rest.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or the ratios are negative or
        leave a negative number of training samples.

    Notes
    -----
    The shuffle uses NumPy's global RNG (``np.random.permutation``); seed it
    beforehand for a reproducible split.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    total_size = len(X)
    if len(y) != total_size:
        raise ValueError(
            "X and y must have the same number of samples "
            f"(got {total_size} and {len(y)})"
        )

    validation_size = int(val_ratio * total_size)
    test_size = int(test_ratio * total_size)
    train_size = total_size - test_size - validation_size
    if validation_size < 0 or test_size < 0 or train_size < 0:
        raise ValueError(
            "val_ratio and test_ratio must be non-negative and sum to at most 1"
        )

    rnd_indices = np.random.permutation(total_size)

    # Use explicit positive bounds: slicing with ``-test_size`` breaks when
    # test_size is 0 (``[-0:]`` is the whole array, ``[a:-0]`` is empty).
    valid_end = train_size + validation_size
    train_idx = rnd_indices[:train_size]
    valid_idx = rnd_indices[train_size:valid_end]
    test_idx = rnd_indices[valid_end:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [89]:
def one_hot(y, n_classes=None):
    """Encode a 1-D array of integer class labels as a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (n_samples,)
        Class labels in ``0 .. n_classes - 1``.
    n_classes : int, optional
        Number of columns in the result. When omitted it is inferred as
        ``y.max() + 1``; pass it explicitly when a subset of the data may not
        contain the highest class, so every encoded split has the same width.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_classes)
        Float matrix with a single 1 per row at the label's column.
    """
    labels = np.asarray(y)
    if n_classes is None:
        # An empty label array has no maximum; fall back to zero columns.
        n_classes = int(labels.max()) + 1 if labels.size else 0
    y_hot = np.zeros((len(labels), n_classes))
    if labels.size:
        y_hot[np.arange(len(labels)), labels] = 1
    return y_hot

In [90]:
# Pass the bias-augmented matrix explicitly so the returned feature splits
# carry the bias column that the model's input size relies on.
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X_with_bias, y)

In [91]:
# One-hot encode the labels of each split. Every call infers its class count
# from its own labels, so the three matrices only share a width when each
# split contains the highest class label.
y_train_hot, y_val_hot, y_test_hot = map(one_hot, (y_train, y_val, y_test))

In [92]:
def softmax(logits):
    """Convert raw class scores into probabilities along the last axis.

    Parameters
    ----------
    logits : numpy.ndarray, shape (n_samples, n_classes) or (n_classes,)
        Unnormalised scores.

    Returns
    -------
    numpy.ndarray
        Same shape as ``logits``; entries along the last axis are non-negative
        and sum to 1.
    """
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the ratio from
    # becoming NaN) when scores are large.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    top = np.exp(shifted)
    bottom = np.sum(top, axis=-1, keepdims=True)
    return top / bottom

In [93]:
# Model dimensions, taken from the training data itself.
n_inputs = X_train.shape[-1]          # columns of the training matrix (features plus the bias column)
n_outputs = np.unique(y_train).size   # distinct class labels seen in training

In [94]:
# Batch gradient descent for softmax regression on the training split.
eta = 0.01           # learning rate
n_iterations = 5001  # one extra step so the loss at iteration 5000 is printed
m = len(X_train)     # number of training samples
epsilon = 1e-7       # added inside the log so a zero probability cannot produce -inf/NaN
Theta = np.random.randn(n_inputs, n_outputs)  # random start; one column of weights per class

for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    y_proba = softmax(logits)
    if iteration % 500 == 0:
        # Cross-entropy averaged over samples, reported every 500 steps.
        loss = -np.mean(np.sum(y_train_hot * np.log(y_proba + epsilon), axis=1))
        print(iteration, loss)
    # Gradient of the mean cross-entropy with respect to Theta.
    error = y_proba - y_train_hot
    grad = (1/m) * X_train.T.dot(error)
    Theta -= (eta*grad)


0 3.2514153811867685
500 0.6943850541626947
1000 0.5988602456060218
1500 0.5387169376828606
2000 0.49714114278350374
2500 0.46620278764967515
3000 0.44189269007882365
3500 0.42201213208048566
4000 0.40526290464729764
4500 0.3908288285807467
5000 0.37816916830391956


In [95]:
# Two small 3x2 integer arrays; no other cell visible here reads them.
X1 = np.array([
    [3, 2],
    [1, 4],
    [7, 6],
])
X2 = np.array([
    [7, 6],
    [2, 1],
    [4, 1],
])

In [96]:
# Show the trained parameter matrix: one row per input column, one column per class.
Theta

array([[-1.09425516,  0.41381588,  0.35147649],
       [-1.04001404, -0.89491519,  0.98521988],
       [ 3.90323678, -0.30106838, -2.92105073]])

In [97]:
# Score the trained model on the validation split: the predicted class is the
# one with the highest probability, and accuracy is the share of exact matches.
logits = X_val @ Theta
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)
accuracy_score = (y_predict == y_val).mean()
accuracy_score

0.9666666666666667