In [2]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix as confusion_matrix_skl

from dataset import iris_data

# 14.1 Classification

- data fitting with outcome that takes on (non-numerical) values like
    - true or false
    - spam or not spam
    - dog, horse, or mouse
- outcome values are called *labels* or *categories*

*Boolean* or *2-way* classification: outcomes as +1 (true) and −1 (false)
- model or *classifier*: $\hat{y} = \hat{f}(x),\ \text{where}\ \hat{f} : \mathbb{R}^n \rightarrow \{−1, +1\}$


## Boolean values

In [3]:
def tf2pm1(b):
    """Encode Boolean value(s) as +1 (true) and -1 (false).

    Works on a single Boolean as well as elementwise on a NumPy Boolean array.
    """
    return 2*b - 1


b = True
tf2pm1(b)

1

In [4]:
# The same encoding applied elementwise to a Boolean array
b = np.array([True, False, True])
tf2pm1(b)

array([ 1, -1,  1])

## Prediction errors

- data point (x,y), predicted outcome $\hat{y} = \hat{f}(x)$
- only four possibilities:
    - True positive. $y = +1$ and $\hat{y} = +1$.
    - True negative. $y = −1$ and $\hat{y} = −1$.
        (in these two cases, the prediction is correct)
    - False positive or type I error. $y = −1$ and $\hat{y} = +1$.
    - False negative or type II error. $y = +1$ and $\hat{y} = −1$.
        (in these two cases, the prediction is wrong)

## Confusion matrix or contingency table

- given data set $x(1), \ldots, x(N)$, $y(1), \ldots, y(N)$ and classifier $\hat{f}$
- count each of the four outcomes

||$\hat{y} = +1$|$\hat{y} = −1$|Total|
|:---:|:---:|:---:|:---:|
|y = +1| $N_{tp}$ | $N_{fn}$ | $N_p$ |
|y = −1| $N_{fp}$ | $N_{tn}$ | $N_n$ |
|All| $N_{tp} + N_{fp}$ | $N_{fn} + N_{tn}$ | $N$ |

- off-diagonal terms are prediction errors
- many error rates and accuracy measures are used
    - error rate is $(N_{fp} + N_{fn})/N$
    - true positive (or recall) rate is $N_{tp}/N_{p}$
    - false positive rate (or false alarm rate) is $N_{fp}/N_n$ \
    Find more metrics here: https://en.wikipedia.org/wiki/Confusion_matrix
- a proposed classifier is judged by its error rate(s) on a test set

In [5]:
# Count errors and correct predictions.
# A label counts as "positive" when it is > 0, so Boolean, 0/1 and -1/+1
# encodings are all accepted (and may differ between y and yhat).
def _is_positive(labels):
    """Return a Boolean array that is True where `labels` is positive."""
    return np.asarray(labels) > 0


def Ntp(y, yhat):
    """Number of true positives: y positive and yhat positive."""
    return np.count_nonzero(_is_positive(y) & _is_positive(yhat))


def Nfn(y, yhat):
    """Number of false negatives: y positive but yhat negative."""
    return np.count_nonzero(_is_positive(y) & ~_is_positive(yhat))


def Nfp(y, yhat):
    """Number of false positives: y negative but yhat positive."""
    return np.count_nonzero(~_is_positive(y) & _is_positive(yhat))


def Ntn(y, yhat):
    """Number of true negatives: y negative and yhat negative."""
    return np.count_nonzero(~_is_positive(y) & ~_is_positive(yhat))


def error_rate(y, yhat):
    """Fraction of misclassified examples, (Nfn + Nfp) / N."""
    return (Nfn(y, yhat) + Nfp(y, yhat)) / len(y)


def error_rate_compact(y, yhat):
    """Same error rate, computed as the mean of the elementwise mismatches."""
    return np.mean(_is_positive(y) != _is_positive(yhat))


def confusion_matrix(y, yhat):
    """Return the 2 by 2 confusion matrix [[Ntp, Nfn], [Nfp, Ntn]].

    Rows correspond to the true outcome (positive first), columns to the
    predicted outcome (positive first).
    """
    return np.array([[Ntp(y, yhat), Nfn(y, yhat)],
                     [Nfp(y, yhat), Ntn(y, yhat)]])


y = np.random.randint(2, size=100)
yhat = np.random.randint(2, size=100)

confusion_matrix(y, yhat)

array([[39, 18],
       [24, 19]])

In [6]:
# sklearn.metrics.confusion_matrix puts the negative class (label 0) first,
# so compared with the matrix computed above its layout is
# [[Ntn, Nfp],
#  [Nfn, Ntp]]
confusion_matrix_skl(y, yhat)

array([[19, 24],
       [18, 39]], dtype=int64)

In [7]:
# Both error-rate implementations are expected to print the same value
print(error_rate(y, yhat), error_rate_compact(y, yhat), sep="\n")

0.42
0.42


# 14.2 Least squares classifier

- fit model $\tilde{f}$ to encoded $(\pm 1) y(i)$ values using standard least squares data fitting
- $\tilde{f}(x)$ should be near $+1$ when $y = +1$, and near $−1$ when $y = −1$
- $\tilde{f}(x)$ is a number
- use model $\hat{f}(x) = sign(\tilde{f}(x))$
- (size of $\tilde{f}(x)$ is related to the 'confidence' in the prediction)

In [8]:
def ftilde(x):
    """Regression model: x @ beta + v.

    The weights `beta` and the offset `v` are read from the notebook's
    global namespace at call time.
    """
    return x @ beta + v


def fhat(x):
    """Regression classifier: True where the regression model is positive."""
    return ftilde(x) > 0

## Iris flower classification

In [9]:
D = iris_data()

# Stack the three species into one data matrix, virginica last
# (150 by 4 for the Iris data set)
iris = np.vstack([D['setosa'], D['versicolor'], D['virginica']])
# y[k] is 1 (true) if example k is virginica and 0 (false) otherwise;
# the -1/+1 encoding is applied below via 2*y - 1
n_virginica = len(D['virginica'])
y = np.concatenate([np.zeros(len(iris) - n_virginica), np.ones(n_virginica)])
# Prepend a column of ones so that the model includes a constant term
A = np.column_stack([np.ones(len(iris)), iris])

# Least squares fit of the -1/+1 encoded labels
theta = np.linalg.lstsq(A, 2*y - 1, rcond=None)[0]
theta

array([-2.39056373, -0.09175217,  0.40553677,  0.00797582,  1.10355865])

In [10]:
# Predict "virginica" wherever the least squares model output is positive
yhat = A @ theta > 0
C = confusion_matrix(y, yhat)
C

array([[46,  4],
       [ 7, 93]])

In [11]:
# Error rate of the Boolean classifier on the data it was fitted to
error_rate(y,yhat)

0.07333333333333333

In [12]:
# Same error rate, computed directly as the fraction of mismatching entries
np.average(y != yhat)  # error_rate_compact

0.07333333333333333

# 14.3 Multi-class classifiers

- $K > 2$ possible labels, with label set $\{1, \ldots, K\}$
- predictor is $\hat{f} : \mathbb{R}^n \rightarrow \{1, \ldots, K\}$
- for given predictor and data set, confusion matrix is K × K
- some off-diagonal entries may be much worse than others

## Multi-class error rate and confusion matrix

In [13]:
def error_rate(y, yhat):
    """Multi-class error rate: fraction of entries in which y and yhat differ.

    Both arguments are converted to NumPy arrays first, so plain Python lists
    are compared elementwise instead of as a single object.
    """
    return np.average(np.asarray(y) != np.asarray(yhat))
def confusion_matrix(y, yhat, K):
    """Return the K by K confusion matrix for labels in 1,...,K.

    Entry (i, j) (0-based) is the number of examples whose true label is i+1
    and whose predicted label is j+1. Labels outside 1,...,K are not counted.

    Parameters
    ----------
    y : array_like
        True labels, one per example.
    yhat : array_like
        Predicted labels, same length as y.
    K : int
        Number of classes.

    Returns
    -------
    numpy.ndarray
        K by K array of counts, stored as floats.
    """
    y = np.asarray(y)
    yhat = np.asarray(yhat)
    classes = np.arange(1, K + 1)
    # N x K indicator matrices: column k marks the examples with label k+1
    is_true = (y[:, None] == classes).astype(float)
    is_pred = (yhat[:, None] == classes).astype(float)
    # (is_true.T @ is_pred)[i, j] counts examples with y == i+1 and yhat == j+1
    return is_true.T @ is_pred

# Test for K = 4 on random label vectors of length 100
# (randint(1, K+1) draws integer labels from 1,...,K)
K = 4
y = np.random.randint(1, K+1, size=100)
yhat = np.random.randint(1, K+1, size=100)
C = confusion_matrix(y, yhat, K)
print(C)

[[9. 7. 9. 7.]
 [3. 9. 6. 5.]
 [5. 8. 7. 5.]
 [6. 8. 0. 6.]]


In [14]:
# The error rate equals one minus the share of counts on the diagonal of C
error_rate(y, yhat), 1 - np.diag(C).sum()/C.sum()

(0.69, 0.69)

## Least squares multi-class classifier

- create a least squares classifier for each label versus the others (*one-versus-others* or *one-versus-all* classifier)
- take as classifier
$$
\hat{f}(x) = \underset{k=1,\ldots,K}{\text{argmax}}\ \tilde{f}_k(x)
$$
    where $\tilde{f}_k = x^T\theta_k$ is the least squares regression model for label $k$ against the others.
    The notation argmax means the index of the largest value among the numbers $\tilde{f}_k(x)$, for $k = 1, \ldots, K$ (i.e., choose the label $k$ with the largest value of $\tilde{f}_k(x)$).

- The n-vectors $\theta_1, \ldots, \theta_K$ are the coefficients or parameters in the model. We can express this in matrix-vector notation as
$$
\hat{f}(x) = \text{argmax}(x^T \Theta),
$$
    where $\Theta = [\theta_1 \ldots \theta_K]$ is the n × K matrix of model coefficients, and the argmax of a row vector has the obvious meaning.


- for example, with

$$
\tilde{f}_1(x) = −0.7, \quad
\tilde{f}_2(x) = +0.2, \quad
\tilde{f}_3(x) = +0.8
$$

we choose $\hat{f}(x) = 3$

In [49]:
def row_argmax(u):
    """Return the 0-based column index of the largest entry in each row of u.

    Parameters
    ----------
    u : array_like
        A matrix (2-D array or nested list).

    Returns
    -------
    numpy.ndarray
        Integer vector with one index per row of u. If a row has several
        maximal entries, the index of the first one is returned.
    """
    return np.argmax(u, axis=1)


A = np.random.normal(size=(4, 5))
A

array([[-0.32728205,  0.74927263,  0.85829598, -0.26169784, -0.57292648],
       [-0.28567832, -1.35986903, -0.12195252, -0.0160003 , -0.21255611],
       [-0.14816482,  0.60836306, -0.10287327,  0.68597006, -0.88790113],
       [ 0.47383631, -0.43889257,  0.58151237,  0.24587018, -0.40126887]])

In [50]:
# 0-based index of the largest entry in each row of A
row_argmax(A)

array([2, 3, 3, 2], dtype=int64)

If a data set with N examples is stored as an n × N data matrix $X$, and Theta is an n × K matrix with the coefficient vectors $\theta_k$ as its columns, then we can now define a function

In [51]:
def fhat(X, Theta):
    """Predicted labels in 1,...,K for the examples stored as columns of X."""
    scores = X.T @ Theta  # one row of K model outputs per example
    return 1 + row_argmax(scores)

to find the N-vector of predictions.

## Matrix least squares

- use least squares to find the coefficient matrix $\Theta$ for a multi-class classifier with n features and K classes, from a data set of N examples.
- assume the data is given as an n × N matrix $X$ and an N-vector $y^{cl}$ with entries in $\{1, \ldots, K\}$ that give the classes of the examples.
- least squares objective can be expressed as a matrix norm squared,
$$
\| X^T \Theta - Y \|^2
$$
where $Y$ is the N × K matrix with
$$
Y_{ij} = \begin{cases}
1 & y^{cl}_i = j \\
−1 & y^{cl}_i \ne j
\end{cases}
$$
- the rows of $Y$ describe the classes using one-hot encoding, converted from 0/1 to −1/+1 values
- least squares solution is given by $\Theta = (X^T)^\dagger Y$

In [21]:
def one_hot(ycl, K):
    """One-hot encode a vector of class labels taking values in 0,...,K-1.

    Parameters
    ----------
    ycl : array_like
        Vector of N class labels (integers, or floats with integer values).
    K : int
        Number of classes.

    Returns
    -------
    numpy.ndarray
        N by K float array Y with Y[i, j] = 1 if ycl[i] == j and 0 otherwise.
        Labels outside 0,...,K-1 produce an all-zero row.
    """
    labels = np.asarray(ycl)
    # Compare every label with every class index at once (N x K Boolean array)
    return (labels[:, None] == np.arange(K)).astype(float)
# Example: one-hot encode six random labels drawn from 0,...,K-1
K = 4
ycl = np.random.randint(K, size = 6)
print(ycl)
Y = one_hot(ycl ,K)
Y

[3 2 0 2 1 1]


array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [24]:
# Convert the 0/1 one-hot encoding to -1/+1 values
2*Y - 1

array([[-1., -1., -1.,  1.],
       [-1., -1.,  1., -1.],
       [ 1., -1., -1., -1.],
       [-1., -1.,  1., -1.],
       [-1.,  1., -1., -1.],
       [-1.,  1., -1., -1.]])

In [83]:
def ls_multiclass(X, ycl, K):
    """Fit a K-class one-versus-others least squares classifier.

    Parameters
    ----------
    X : array_like
        n by N data matrix, one example per column.
    ycl : array_like
        N-vector of class labels with entries in 1,...,K.
    K : int
        Number of classes.

    Returns
    -------
    Theta : numpy.ndarray
        n by K matrix of model coefficients (least squares solution).
    yhat : numpy.ndarray
        N-vector of predicted labels in 1,...,K for the columns of X.

    Raises
    ------
    ValueError
        If X is not a 2-D matrix.
    """
    if np.ndim(X) != 2:
        raise ValueError("X must be a 2-D (n by N) data matrix")
    Xt = np.asarray(X).T  # N by n: one example per row
    # one_hot expects labels in 0,...,K-1; its 0/1 output is mapped to -1/+1
    targets = 2*one_hot(np.asarray(ycl) - 1, K) - 1
    Theta = np.linalg.lstsq(Xt, targets, rcond=None)[0]
    # +1 turns the 0-based argmax index back into a label in 1,...,K
    yhat = 1 + row_argmax(Xt @ Theta)
    return Theta, yhat

## Iris flower classification

In [93]:
# Load the Iris data and convert each species' samples to a NumPy array
D = iris_data()
setosa, versicolor, virginica = (
    np.array(D[species]) for species in ('setosa', 'versicolor', 'virginica')
)
print(setosa.shape)  # (number of samples, number of features)

(50, 4)


In [94]:
# draw three independent random permutations of the indices 0,...,49,
# one per species
I1, I2, I3 = (np.random.permutation(50) for _ in range(3))
print(I1)

[ 2 18 48 37 16 33 40 39 27 30 19  3 25 22  1 23 13 10 15 21  6 35 26  0
  5 11 12  9 43 20 38 29 32 41 14 42 31 45 49 28 24 17 47 36 34 44 46  7
  4  8]


In [95]:
# training set: 40 randomly picked examples per class, one column per example
Xtrain = np.concatenate([setosa[I1[:40]],
                         versicolor[I2[:40]],
                         virginica[I3[:40]]], axis=0).T
# add constant feature one as the first row
Xtrain = np.vstack([np.ones((1, 120)), Xtrain])
print(Xtrain.shape)  # (number of features, number of samples)

(5, 120)


In [96]:
# true training labels: 40 ones, then 40 twos, then 40 threes, matching the
# order in which the three classes are stacked in Xtrain
ytrain = np.repeat([1.0, 2.0, 3.0], 40)
print(ytrain.shape)  # number of samples

(120,)


In [97]:
# test set: the remaining 10 examples of each class, one column per example
Xtest = np.concatenate([setosa[I1[40:]],
                        versicolor[I2[40:]],
                        virginica[I3[40:]]], axis=0).T
# add constant feature one as the first row
Xtest = np.vstack([np.ones((1, 30)), Xtest])
# true test labels: 10 ones, 10 twos, 10 threes
ytest = np.repeat([1.0, 2.0, 3.0], 10)

In [98]:
# Fit the 3-class least squares classifier on the training set and print its
# training-set predictions followed by the true labels
Theta, yhat = ls_multiclass(Xtrain, ytrain, 3)
print(yhat)
print(ytrain)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 3 2 2 3 2 2 3 3
 3 2 3 2 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 2 3 3
 3 3 3 3 2 3 2 3 3]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.
 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]


In [99]:
# Training-set confusion matrix (rows: true class, columns: predicted class)
Ctrain = confusion_matrix(ytrain, yhat, 3)
print(Ctrain)

[[40.  0.  0.]
 [ 0. 30. 10.]
 [ 0.  6. 34.]]


In [100]:
# Fraction of misclassified training examples
error_train = error_rate(ytrain, yhat)
print(error_train)

0.13333333333333333


In [103]:
# Predict labels for the test set with the fitted Theta
# (+1 converts the 0-based argmax index into a label starting at 1);
# note that yhat now holds the test predictions, not the training ones
yhat = row_argmax(Xtest.T @ Theta) + 1
print(yhat)
print(ytest)
# Test-set confusion matrix (rows: true class, columns: predicted class)
Ctest = confusion_matrix(ytest, yhat, 3)
print(Ctest)

[1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 3 3 2 2 3 2 3 3 3 3 3 2 3 2]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 3. 3. 3.
 3. 3. 3. 3. 3. 3.]
[[10.  0.  0.]
 [ 0.  8.  2.]
 [ 0.  3.  7.]]


In [104]:
# Fraction of misclassified test examples
error_test = error_rate(ytest, yhat)
print(error_test)

0.16666666666666666
