## Lecture 17: Feature Selection, $k$-means clustering

In [2]:
import numpy as np
import os

import matplotlib.pyplot as plt
from matplotlib import rc

# Global matplotlib styling shared by every figure in this notebook.
plt.rcParams.update({
    'xtick.labelsize': 16,      # tick label size, x axis
    'ytick.labelsize': 16,      # tick label size, y axis
    'axes.linewidth': 1,        # line width of the axes frame
    'xtick.major.width': 3,     # major tick line width, x axis
    'ytick.major.width': 3,     # major tick line width, y axis
})
rc('text', usetex=False)           # disable LaTeX rendering in plots
rc('font', family='DejaVu Sans')   # use DejaVu Sans as the plot font

In [5]:
from scipy import io

In [1]:
# Mount Google Drive at /content/drive; the data-loading cells below read
# their .mat files from a folder under that mount point.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 5.1 Feature Selection and Data Mining

#### Dataset 2: dogs and cats
an image database of 80 dogs and 80 cats

The data for each cat and dog is the $64\times64$ pixel space of the image. Thus each image has $4096$ measurements, in contrast to the four measurements for each example in the iris data set.

The end goal is to select a finite set of features that can help us distinguish between a dog and a cat image.

In [9]:
# Root folder of the course material on the mounted Google Drive.
path = "/content/drive/MyDrive/ME491"
data1_path = os.path.join(path, "data/fisheriris.mat")  # iris file; not loaded in the cells shown here

# Dog images: read the .mat file and pull out its 'dog' array.
dog_path = os.path.join(path, "data/dogData.mat")
dogdata_mat = io.loadmat(dog_path)
dog = dogdata_mat['dog']

# Cat images: same procedure with the 'cat' array.
cat_path = os.path.join(path, "data/catData.mat")
catdata_mat = io.loadmat(cat_path)
cat = catdata_mat['cat']

# Each image column is folded back into an m-by-n pixel grid for display.
m = n = 64

##### Feature detection

Now that we have a general understanding of the data, we want to extract features from the dataset. We will use PCA, as we did for eigenfaces, to find the dominant features.

In [10]:
# Dogs and cats are placed side by side (dog columns first, then cat columns)
# so that one common set of PCA coordinates describes both classes.
DC = np.concatenate((dog, cat), axis=1)

# PCA: remove the mean image from every column, then take the economy SVD.
avgAnimal = DC.mean(axis=1)
X = DC - avgAnimal[:, np.newaxis]   # broadcasting subtracts the mean column-wise
U, S, VT = np.linalg.svd(X, full_matrices=False)

In [None]:
# Show the mean image: fold the flat pixel vector back into an m-by-n grid
# and transpose it before display (same convention as the other image cells).
avg_image = avgAnimal.reshape(m, n).T
plt.imshow(avg_image, cmap="Greys_r")
plt.axis('off')

In [None]:
# Tile the first 10 left singular vectors (the "animal features") into a
# 2 x 5 mosaic and display it.

i = 2   # mosaic rows
j = 5   # mosaic columns

eigenanimal = np.zeros((n*i, m*j))

for count in range(i*j):
  ii, jj = divmod(count, j)   # row-major position of tile `count`
  # Column `count` of U, folded back into an image and transposed for display
  tile = U[:, count].reshape(m, n).T
  eigenanimal[ii*n:(ii+1)*n, jj*m:(jj+1)*m] = tile

# Fixed symmetric color limits so every tile shares the same gray scale
img = plt.imshow(eigenanimal, vmin = -1e-2, vmax = 1e-2, cmap="Greys_r")
plt.axis('off')

In [13]:
# Wavelet-domain dogs: read the .mat file and pull out its 'dog_wave' array.
dog_w_path = os.path.join(path, "data/dogData_w.mat")
dogwdata_mat = io.loadmat(dog_w_path)
dog_w = dogwdata_mat['dog_wave']

# Wavelet-domain cats: same procedure with the 'cat_wave' array.
cat_w_path = os.path.join(path, "data/catData_w.mat")
catwdata_mat = io.loadmat(cat_w_path)
cat_w = catwdata_mat['cat_wave']

In [None]:
# Mosaic of the first 36 dogs in the wavelet representation (6 x 6 tiles).
# NOTE: n and m are rebound here from 64 to 32 and stay at 32 for the rest of
# the notebook; the 64-pixel image cells above will fail if re-run after this
# cell without resetting m and n.
n = 32
m = 32
alldogs_w = np.zeros((n*6,m*6))

for count in range(36):
  j, k = divmod(count, 6)   # row-major tile position
  alldogs_w[j*n:(j+1)*n, k*m:(k+1)*m] = dog_w[:,count].reshape(m,n).T

img = plt.imshow(alldogs_w, cmap='gray')
plt.axis('off')

In [15]:
# Same pipeline as in pixel space, now on the wavelet data:
# dog columns first, then cat columns.
DC_w = np.concatenate((dog_w, cat_w), axis=1)

# PCA: remove the mean column, then take the economy SVD.
avgAnimal_w = DC_w.mean(axis=1)
Xw = DC_w - avgAnimal_w[:, np.newaxis]   # broadcasting subtracts the mean column-wise
Uw, Sw, VTw = np.linalg.svd(Xw, full_matrices=False)

In [None]:
# Show the mean wavelet-domain image: fold the flat vector back into an
# m-by-n grid (m and n are 32 at this point) and transpose it before display.
avg_image_w = avgAnimal_w.reshape(m, n).T
plt.imshow(avg_image_w, cmap="Greys_r")
plt.axis('off')

In [None]:
# Tile the first 10 left singular vectors of the wavelet-space SVD into a
# 2 x 5 mosaic and display it.

i = 2   # mosaic rows
j = 5   # mosaic columns

eigenanimal_w = np.zeros((n*i, m*j))

for count in range(i*j):
  ii, jj = divmod(count, j)   # row-major position of tile `count`
  # Column `count` of Uw, folded back into an image and transposed for display
  tile = Uw[:, count].reshape(m, n).T
  eigenanimal_w[ii*n:(ii+1)*n, jj*m:(jj+1)*m] = tile

# Fixed symmetric color limits; the diverging map separates negative from positive weights
img = plt.imshow(eigenanimal_w, vmin = -1e-2, vmax = 1e-2, cmap="coolwarm")
plt.axis('off')

When we were discussing PCA in Ch. 1, we never spent too much time discussing the meaning of the $V$ matrix.  Here, we will use $V$ matrix to perform feature engineering.

The importance of each feature to an individual image is given by the $V$ matrix in the SVD. Specifically, each column of $V$ determines the loading, or weighting, of each feature onto a specific image.

We can now look at the distributions for the $V$ matrix for dogs and cats.

In [None]:
# Histograms of the first four rows of VT (pixel space) and VTw (wavelet
# space), drawn separately for dogs and cats.

# Bin centers, plus the matching bin edges: one more edge than centers,
# shifted by half a bin so that each center sits in the middle of its bin.
xbin = np.linspace(-0.25, 0.25, 20)
xbin_edges = np.append(xbin, xbin[-1]+(xbin[1]-xbin[0])) - (xbin[1]-xbin[0])/2

# DC and DC_w were built as (dogs, cats), so the leading columns of VT / VTw
# belong to dogs and the remaining ones to cats. Take the split point from the
# data instead of hard-coding the number of dog images.
panels = ((VT, dog.shape[1]), (VTw, dog_w.shape[1]))

fig, axs = plt.subplots(4,2)
fig.set_size_inches(6, 8)
for j in range(4):
  # Left column: pixel-space loadings; right column: wavelet-space loadings.
  for col, (loadings, n_dogs) in enumerate(panels):
    pdf1 = np.histogram(loadings[j,:n_dogs], bins=xbin_edges)[0]
    pdf2 = np.histogram(loadings[j,n_dogs:], bins=xbin_edges)[0]
    axs[j,col].plot(xbin, pdf1, label = "dogs")
    axs[j,col].plot(xbin, pdf2, label = "cats")
    axs[j,col].legend()
  axs[j,0].set_ylabel('PCA'+str(j+1), fontsize = 18)

axs[0,0].set_title("image space", fontsize = 18)
axs[0,1].set_title("wavelet space", fontsize = 18)

# tight_layout computes the subplot spacing once, from what is on the figure
# at call time, so it must run after the final size, labels and titles are set.
fig.tight_layout(h_pad=0, w_pad=2)

All dog and cat images projected onto the first three PCA coordinates (dogs in red, cats in blue; image space on top, wavelet space below).

In [None]:
# 3-D scatter of the first three rows of VT (top) and VTw (bottom).
# Columns 0-79 are drawn in red, the remaining columns in blue.
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1, projection='3d')
ax2 = fig.add_subplot(2, 1, 2, projection='3d')

for ax, loadings in ((ax1, VT), (ax2, VTw)):
  # Unpacking the three leading rows supplies the x, y and z coordinates
  ax.scatter(*loadings[:3, :80], c='r', marker='o', s=20)
  ax.scatter(*loadings[:3, 80:], c='b', marker='o', s=20)

plt.show()

### 5.3 Unsupervised Learning: $k$-Means Clustering

`scikit-learn` has a built-in function to perform $k$-means clustering.  We will first code our own version of $k$-means to observe how the algorithm behaves for a few iterations, and then learn how to use the `scikit-learn` version.

The iteration step for $k$-means is very straightforward; the hard part is coming up with an iteration stopping criterion.  We will not explore that part in class but rather use the standard functions in Python.

#### 1. Lloyd's algorithm

First, let's prepare a synthetic dataset.

In [22]:
# Synthetic two-cluster dataset: an axis-aligned ellipse and a second
# ellipse that is shifted and then rotated.

# Training and testing set sizes
n1 = 100 # Train
n2 = 50  # Test

# Seeded generator, so the data (and every figure built from it) is identical
# after Restart & Run All, without touching NumPy's global random state.
rng = np.random.default_rng(0)

# Random ellipse 1 centered at (0,0)
x = rng.standard_normal(n1+n2)
y = 0.5*rng.standard_normal(n1+n2)

# Random ellipse 2 centered at (1,-2) (before the rotation below)
x2 = rng.standard_normal(n1+n2) + 1
y2 = 0.2*rng.standard_normal(n1+n2) - 2

# Rotate ellipse 2 counter-clockwise by theta about the origin
theta = np.pi/4
A = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])

x3 = A[0,0]*x2 + A[0,1]*y2
y3 = A[1,0]*x2 + A[1,1]*y2

In [None]:
# Scatter both point clouds: ellipse 1 first, then the rotated ellipse 2.
plt.figure()
for xs, ys in ((x, y), (x3, y3)):
  plt.plot(xs, ys, 'o')
plt.show()

Now let's write our $k$-means iterations. We will apply the iteration 4 times.

In [None]:
# Four hand-written k-means (Lloyd) iterations on the full dataset, with one
# subplot per iteration showing the current assignment and centroids.

# Stack both point clouds into one array with a point per row; the
# algorithm is not told which cloud a point came from.
X1 = np.column_stack((x3, y3))
X2 = np.column_stack((x, y))

Y = np.concatenate((X1, X2))

g1 = np.array([-1, 0]) # Initial guess
g2 = np.array([1, 0])
fig, axs = plt.subplots(2,2)
axs = axs.reshape(-1)   # flatten so iteration j draws into axs[j]
for j in range(4):
  # Assignment step: each point goes to the nearer centroid (ties go to g2)
  d1 = np.linalg.norm(Y - g1, ord = 2, axis = 1)
  d2 = np.linalg.norm(Y - g2, ord = 2, axis = 1)
  idx1 = np.where(d1 < d2)
  idx2 = np.where(d1 >= d2)
  class1 = Y[idx1[0],:]
  class2 = Y[idx2[0],:]

  # Plot the assignment together with the centroids that produced it
  axs[j].plot(class1[:,0], class1[:,1], 'o', ms=5)
  axs[j].plot(class2[:,0], class2[:,1], 'o', ms=5)
  axs[j].plot(g1[0], g1[1], 'k*', ms=10)
  axs[j].plot(g2[0], g2[1], 'k*', ms=10)

  # Update step: move each centroid to the mean of its points. An empty
  # cluster keeps its previous centroid; the mean of zero points is NaN and
  # would leave every later distance, and therefore every assignment, broken.
  if len(class1) > 0:
    g1 = class1.mean(axis=0)
  if len(class2) > 0:
    g2 = class2.mean(axis=0)

plt.show()

Now let's use `scikit-learn` to do $k$-means
documentation: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
from sklearn.cluster import KMeans

# Two clusters; the fixed random_state makes the fit repeatable.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(Y)
c = kmeans.cluster_centers_   # one row per fitted cluster center
ind = kmeans.labels_          # cluster label (0 or 1) for each row of Y

In [None]:
# Row indices of the points assigned to each of the two clusters.
idx1 = (ind == 0).nonzero()
idx2 = (ind == 1).nonzero()

print(Y.shape)

# One scatter per cluster, then the fitted centers as black stars.
plt.figure()
for members in (idx1[0], idx2[0]):
  plt.plot(Y[members, 0], Y[members, 1], 'o')
plt.plot(c[:,0], c[:,1], 'k*', ms=10)
plt.show()