In [None]:
# You might want to use the following packages
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def make_xor(n_points):
    """Sample two labelled 2-D Gaussian clusters.

    Despite the name, only two clusters are produced: label 0 around (0, 0)
    and label 1 around (1, 1), both with standard deviation 0.3. Each cluster
    receives ``n_points // 4`` samples, so the result holds
    ``2 * (n_points // 4)`` rows, ordered cluster by cluster.

    Samples are drawn from NumPy's global random state.

    Returns:
        A tuple ``(X, y)`` where ``X`` has one row per sample and two
        columns, and ``y`` is the float vector of matching 0/1 labels.
    """
    per_cluster = n_points // 4
    cluster_centers = np.array([[0, 0], [1, 1]])
    cluster_labels = np.array([0, 1])

    chunks = []
    for mu, tag in zip(cluster_centers, cluster_labels):
        coords = np.random.normal(loc=mu, scale=0.3, size=(per_cluster, 2))
        # Third column carries the cluster label (as float, like the points).
        tag_column = tag * np.ones((per_cluster, 1))
        chunks.append(np.hstack((coords, tag_column)))
    stacked = np.vstack(chunks)

    return (stacked[:, [0, 1]], stacked[:, 2])


# Seed NumPy's global generator so the sampled data (and anything drawn from
# np.random afterwards) is identical on every Restart & Run All; the split
# below was already pinned through random_state.
np.random.seed(49)

X, y = make_xor(1000)
y = y.astype(np.int64)  # make_xor returns float labels; store them as integers
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=49)
print(X_train.shape)
print(X_test.shape)

# Labelled axes make the figure readable on its own; the trailing semicolon
# keeps the PathCollection repr out of the cell output.
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral)
plt.xlabel("feature 1")
plt.ylabel("feature 2")
plt.title("Generated data, coloured by label");

$f = \sigma(WX + b)$

- $X$: data matrix of dimension $d \times N$, where $d$ is the number of attributes and $N$ is the number of samples
- $W$, $b$: weight ($k \times d$) and bias ($k \times 1$) of the model (aka model parameters). The output $f$ of the perceptron has dimension $k \times N$. The bias column is broadcast to each column of $WX$.
- $\sigma$: activation function (the step function in a perceptron)


**Goal:** find the optimal $W$ and $b$ that minimize

$\mathcal{L}_\text{MSE} := \mathbb{E}\left( \|f - y\|^2 \right)$

**Strategy:**
1.   Begin with some random initial values of $W$, $b$
2.   Compute $\frac{\partial \mathcal{L}}{\partial W}$ and $\frac{\partial \mathcal{L}}{\partial b}$.
3.   Update $W$, $b$ using gradient descent
4.   Repeat steps 2 and 3 for a user-specified number of epochs

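**Gradients:**

The step function has zero derivative almost everywhere, so the exact gradient of $\mathcal{L}_\text{MSE}$ carries no learning signal. The perceptron rule therefore treats $\sigma'$ as $1$ and absorbs the constant factor $2$ into the learning rate, giving: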

$\frac{\partial \mathcal{L}}{\partial W} = \frac{1}{N} (f-y) X^\top$

$\frac{\partial \mathcal{L}}{\partial b} = \frac{1}{N} \sum_{j = 1}^N (f_{:,j}-y_{:,j})$  

In [None]:
from sklearn.base import BaseEstimator
import numpy as np

class MyPerceptron(BaseEstimator):
  """Single-layer perceptron with a step activation, f = step(W X + b).

  Data is laid out with one sample per column: ``X`` is ``(d_in, N)`` and the
  prediction ``f`` is ``(d_out, N)``. Targets may be a length-``N`` vector
  (for ``d_out == 1``) or a ``(d_out, N)`` array.

  Attributes:
    d_in, d_out: the constructor arguments, stored so that ``BaseEstimator``
      helpers (``get_params``, ``repr``, ``clone``) can read them back.
    W: weight matrix of shape ``(d_out, d_in)``.
    b: bias column of shape ``(d_out, 1)``.
  """

  def __init__(self, d_in, d_out):
    """Create a perceptron with small random parameters.

    Args:
      d_in: dimension of the input (number of attributes per sample).
      d_out: dimension of the output (number of perceptron units).
    """
    # BaseEstimator looks every __init__ argument up as an attribute of the
    # same name; without these, get_params()/repr(model) raise AttributeError.
    self.d_in = d_in
    self.d_out = d_out
    self.W = np.random.normal(loc=0, scale=0.01, size=(d_out, d_in))
    self.b = np.random.normal(loc=0, scale=0.01, size=(d_out, 1))

  def fit(self, X, y, epochs, learning_rate):
    """Train with full-batch perceptron updates for a fixed number of epochs.

    Every 10th epoch a progress line is printed with the loss and accuracy
    of the parameters as they are *before* that epoch's update.

    Args:
      X: inputs of shape ``(d_in, N)``.
      y: targets, length ``N`` or shape ``(d_out, N)``.
      epochs: number of passes over the data.
      learning_rate: step size applied to ``dW`` and ``db``.

    Returns:
      self, following the scikit-learn convention so calls can be chained.
    """
    for i in range(epochs):
      f = self.forward_pass(X) # f is a prediction

      if i % 10 == 0:
        # Both metrics come from the same forward pass, so they describe the
        # same parameter values.
        print('Epoch %d/%d: loss %f - accuracy %f'
              %(i, epochs, self.loss(y, f), self._accuracy(y, np.squeeze(f))))

      # dW is dL/dW, db is dL/db (perceptron rule, see backward_pass).
      dW, db = self.backward_pass(X, y, f)
      self.W -= learning_rate*dW
      self.b -= learning_rate*db

    return self

  def forward_pass(self, X):
    """Return the 0/1 step activation of ``W X + b``, shape ``(d_out, N)``."""
    Z = np.matmul(self.W, X) + self.b  # b broadcasts across the N columns
    f = np.where(Z>=0, 1, 0)
    return f

  def backward_pass(self, X, y, f):
    """Compute the perceptron-rule updates for ``W`` and ``b``.

    Returns:
      ``(dW, db)`` with ``dW = (f - y) X^T / N`` of shape ``(d_out, d_in)``
      and ``db`` the per-unit mean of ``f - y``, shape ``(d_out, 1)``.
    """
    # The number of samples is the number of columns of X. y.shape[0] only
    # equals it for 1-D targets and is d_out for (d_out, N) targets.
    m = X.shape[1]
    df = f - y
    dW = np.dot(df, X.T)/m
    db = np.sum(df, axis=1, keepdims=True)/m
    return dW, db

  def loss(self, y, f):
    """Mean over samples of the squared error ``||f - y||^2``."""
    diff = np.atleast_1d(f - y)
    # Samples run along the last axis of the (broadcast) difference.
    m = diff.shape[-1]
    L = np.sum( diff**2 )/m
    return L

  def predict(self, X):
    """Return predictions for ``X`` with singleton dimensions squeezed out."""
    f = self.forward_pass(X)
    y_hat = np.squeeze(f)
    return y_hat

  def evaluate(self, X, y):
    """Return the fraction of predictions on ``X`` that equal ``y``."""
    return self._accuracy(y, self.predict(X))

  @staticmethod
  def _accuracy(y, y_hat):
    """Fraction of entries where prediction and target agree."""
    # np.mean is vectorised and yields a scalar for 1-D and 2-D targets alike,
    # unlike the builtin sum over an ndarray.
    return np.mean(y_hat == y)

In [None]:
# Two input attributes, one output unit. The model expects one sample per
# column, hence the transposes on the feature matrices.
model = MyPerceptron(d_in=2, d_out=1)
model.fit(X=X_train.T, y=y_train, epochs=50, learning_rate=0.01)
print("Test Accuracy:", model.evaluate(X=X_test.T, y=y_test))