In [None]:
!pwd
!PYTHONPATH=$PYTHONPATH:/Users/ratlifflj/repos/teach/S22EE445-Dev/lecture-ntbks/utils/

## Module 2 - Lecture 3
### Least squares classifiers

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as la
from numpy.polynomial.polynomial import polyvander
from numpy.polynomial.polynomial import polyval
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
import pickle as pk
import pandas as pd
import seaborn as sns

# put the cf_matrix.py file in the current directory or change your pythonpath (as above) to 
# have the location where you stored cf_matrix.py
# from cf_matrix import make_confusion_matrix as cfmat
sns.set_theme(style="whitegrid")

fs=24
lw=4

rms = lambda x: np.sqrt(np.mean(np.square(x))) 

%load_ext autoreload
%autoreload 2

### Least Squares Classifier with Iris Data
The iris data set is a standard data set used in basic ML tasks. This data sets consists of three different types of irises'---i.e., Setosa, Versicolour, and Virginica---whose petal and sepal length are stored in a $150\times 4$ `numpy.ndarray`.

The rows are the samples and the columns are `sepal length`, `sepal width`, `petal length` and `petal width`, respectively.

You can actually load this data set directly from `scikit-learn` or from the file on the course website. 

`from sklearn import datasets`

`iris = datasets.load_iris()`



See [here](https://en.wikipedia.org/wiki/Iris_flower_data_set) and [here](https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html) for more information on this dataset.



#### Confusion Matrix
Before diving into the example, we need some functions to compute the error rates and confusion matrix we learned about in lecture.

In [5]:
numTP = lambda y,yhat: sum([1 for i in range(len(y)) if y[i] == True and yhat[i] == True])
numFN = lambda y,yhat: sum([1 for i in range(len(y)) if y[i] == True and yhat[i] == False])
numFP = lambda y,yhat: sum([1 for i in range(len(y)) if y[i] == False and yhat[i] == True])
numTN = lambda y,yhat: sum([1 for i in range(len(y)) if y[i] == False and yhat[i] == False]) 
confusion_matrix = lambda y,yhat: np.vstack([[numTP(y,yhat),numFN(y,yhat)],[numFP(y,yhat),numTN(y,yhat)]])
error_rate = lambda y,yhat: (numFN(y,yhat) + numFP(y,yhat)) / len(y)
error_rate2 = lambda y,yhat: np.average(y != yhat)

### Classification Example
We are going to try and learn a classifier for virginica based on the four features. Our classifier is fit using
$$A=\begin{bmatrix} \boldsymbol{1} & D\end{bmatrix}\in \mathbb{R}^{5\times 150}$$
where $D$ is the iris data containing the four features. 

The $y\in \mathbb{R}^{150}$ vector contains $100$ entries with the value `False` and $50$ entries with the value `True`. To create this, we create a boolean vector using `y=np.hstack([np.full(50, False),np.full(50, False),np.full(50, True)])` and then we define $b=2\cdot v-1$. We then solve
$$\min_x \|Ax-b\|^2_2$$
and use
$$\hat{y}=\mathrm{sign}(A\hat{x})$$
as our classifier.

In [6]:
irisD_=pk.load(open('../data/iris_data.p','rb'))
iris_=np.vstack([irisD_["setosa"],irisD_["versicolor"],irisD_["virginica"]])
la.norm(iris_-iris)

FileNotFoundError: [Errno 2] No such file or directory: '../data/iris_data.p'

In [10]:
#irisD=pk.load(open('./data/iris_data.p','rb'))
#iris=np.vstack([irisD["setosa"],irisD["versicolor"],irisD["virginica"]])
# or from scikit-learn
irisD = datasets.load_iris()

#f can be made via a regression model, and the > comparator
f_basis = lambda x: x@beta + v
f = lambda x: f_basis(x) > 0


iris = irisD['data'] 
print("shape of iris array : ", np.shape(iris))

# create the y vector
# we put false for the first two types of iris and true for the third type (namely virginica)
# i.e. if k == virginica: y[k] = True (1) ; else False (0)
y = np.hstack([np.full(50, False),np.full(50, False),np.full(50, True)])
b=2*y-1

# Create the A matrix by stacking ones with the iris array
A = np.hstack([np.ones((150,1)), iris])
print(iris)

## Solve least squares
theta = la.lstsq(A,b,rcond=None)[0]

## get classification
yhat = np.matmul(A,theta) > 0 #regression classifier 

## Create Confusion Matrix and plot it
# C = confusion_matrix(y,yhat)
# cfmat(C, group_names=['TP', 'FN', 'FP', 'TN'],categories=['+1','-1'])
print("error rate : {:.2f}%".format(error_rate(y,yhat)*100))

plt.savefig('../figs/cfmatrix_iris.png')

shape of iris array :  (150, 4)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]

FileNotFoundError: [Errno 2] No such file or directory: '../figs/cfmatrix_iris.png'

<Figure size 432x288 with 0 Axes>