# Exercise on using unsupervised constructed features for pattern recognition

To get unsupervised constructed features of an image, we use a pretrained CNN as a feature extractor. For this purpose we pushed each image through the pretrained CNN and extracted the activations of the first fully connected layer. As pretrained CNN we use a VGG16 architecture that was trained on ImageNet data and placed second in the ImageNet competition in 2014.

In this manner we obtained unsupervised constructed features for 1000 images of the MNIST data set and 1000 images of the CIFAR10 data set. Both data sets have 10 distinct classes. The data sets are balanced, meaning we have 100 images per class. To assess whether the extracted features are good, we do a PCA and a t-SNE visualization and check whether we can observe 10 clusters corresponding to the 10 classes. As a baseline benchmark we do the PCA and t-SNE with the raw image pixels. Then we also do a PCA and t-SNE with the VGG features.

a) Go through the code which is used to produce for the MNIST data a 2D PCA and t-SNE plot using the raw pixel features. What differences can you see between both plots? Discuss your observations (e.g. with your neighbor)



b) Go through the code which is used to produce for the MNIST data a 2D PCA and t-SNE plot using the unsupervised constructed VGG features. Compare the 2D plots we get with raw pixel features and with VGG features and discuss your observations (e.g. with your neighbor).


c) Go through the code which is used to produce for the CIFAR10 data 2D plots based on the pixel features. Complete the code needed to produce a t-SNE plot. Compare the 2D PCA plot and t-SNE plot and discuss your observations (e.g. with your neighbor).



d) Go through the code which is used to produce for the CIFAR10 data 2D plots based on the unsupervised constructed VGG features.  Complete the code needed to produce a PCA plot. Compare the 2D PCA plot and t-SNE plot and discuss your observations (e.g. with your neighbor).



### General imports

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as imgplot
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from pylab import *


import time
import tensorflow as tf
tf.set_random_seed(1)

import keras
import sys
print ("Keras {} TF {} Python {}".format(keras.__version__, tf.__version__, sys.version_info))

# MNIST

### Data preparation for MNIST

In [None]:
# Download MNIST. Only the training split is used in this notebook, so the
# test split is discarded immediately.
from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
del x_test, y_test

In [None]:
# Build a class-balanced subsample: for every label draw 100 distinct random
# image indices, then keep only those images and labels.
np.random.seed(seed=486)
n_classes = len(np.unique(y_train))
per_class_picks = [
    np.random.choice(np.where(y_train == label)[0], 100, replace=False)
    for label in range(n_classes)
]
idx = np.concatenate(per_class_picks)

x_train = x_train[idx]
y_train = y_train[idx]

In [None]:
# Sanity check of the subsample: image array shape, label array shape and
# the number of images per class.
for summary in (x_train.shape, y_train.shape, np.unique(y_train, return_counts=True)):
    print(summary)

In [None]:
# Show one randomly chosen example image per class, side by side in one row.
plt.figure(figsize=(20, 20))
n_classes = len(np.unique(y_train))
for digit in range(n_classes):
    candidates = np.where(y_train == digit)[0]
    pick = np.random.choice(candidates, 1)
    plt.subplot(1, 10, digit + 1)
    # x_train[pick] carries a leading axis of length 1; take its single
    # entry and view it as a 28x28 grayscale image.
    plt.imshow(x_train[pick][0].reshape(28, 28), cmap="gray")

##  Visualizing MNIST using raw image pixel features

### PCA on pixel values MNIST

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the images, each flattened to a vector of
# 28*28 pixel values.
pca = PCA(n_components=2)
pca.fit(X=x_train.reshape(len(x_train), 28 * 28))

In [None]:
# Scatter plot of the images in the plane of the two fitted principal
# components, coloured by class label.

# Project once and reuse both columns instead of running the same
# transform twice (once per axis).
projected = pca.transform(x_train.reshape((len(x_train), 28 * 28)))

plt.figure(figsize=(8, 8))
# plt.get_cmap is used instead of cm.get_cmap: the latter was removed in
# Matplotlib 3.9 and is only in scope through a wildcard pylab import.
cmap = plt.get_cmap('jet', 10)  # 10 discrete colours, one per class
plt.scatter(x=projected[:, 0], y=projected[:, 1], c=y_train, s=35, cmap=cmap)
plt.title("PCA on raw pixelvalues mnist")
plt.colorbar()
plt.show()

### t-sne on pixel values MNIST

In [None]:
from sklearn.manifold import TSNE

# Embed the flattened pixel vectors into 2D with t-SNE. This takes some
# time; random_state fixes the seed of the embedding.
model = TSNE(random_state=0, n_components=2)
tsne = model.fit_transform(X=x_train.reshape(len(x_train), 28 * 28))

In [None]:
# Scatter plot of the 2D t-SNE embedding, coloured by class label.
plt.figure(figsize=(8, 8))
# plt.get_cmap is used instead of cm.get_cmap: the latter was removed in
# Matplotlib 3.9 and is only in scope through a wildcard pylab import.
cmap = plt.get_cmap('jet', 10)  # 10 discrete colours, one per class
plt.scatter(x=tsne[:, 0], y=tsne[:, 1], c=y_train, s=35, cmap=cmap)
plt.title("t-sne on raw pixelvalues mnist")
plt.colorbar()
plt.show()

##  Visualizing MNIST based on VGG features

### Getting VGG16 features for MNIST

In [None]:
# Downloading embeddings which have been extracted beforehand
import urllib
import os
if not os.path.isfile('Mnist_EMB_1000.npz'):
    urllib.request.urlretrieve(
    "https://www.dropbox.com/s/ejiu7ymoyn6kxp7/Mnist_EMB_1000.npz?dl=1",
    "Mnist_EMB_1000.npz")
%ls -hl Mnist_EMB_1000.npz
Data=np.load("Mnist_EMB_1000.npz")
vgg_features_mnist = Data["arr_0"]

### PCA on vgg16 features on MNIST

In [None]:
# Fit a 2-component PCA on the VGG16 feature vectors of the MNIST subsample.
pca = PCA(n_components=2)
pca.fit(X=vgg_features_mnist)

In [None]:
# Scatter plot of the VGG features in the plane of the two fitted principal
# components, coloured by class label.

# Project once and reuse both columns instead of running the same
# transform twice (once per axis).
projected = pca.transform(vgg_features_mnist)

plt.figure(figsize=(8, 8))
# plt.get_cmap is used instead of cm.get_cmap: the latter was removed in
# Matplotlib 3.9 and is only in scope through a wildcard pylab import.
cmap = plt.get_cmap('jet', 10)  # 10 discrete colours, one per class
plt.scatter(x=projected[:, 0],
            y=projected[:, 1],
            c=y_train, s=35, cmap=cmap)
plt.title("PCA on VGG-features MNIST")
plt.colorbar()
plt.show()

### t-sne on vgg16 features mnist

In [None]:
# Embed the MNIST VGG16 feature vectors into 2D with t-SNE; random_state
# fixes the seed of the embedding.
model = TSNE(random_state=0, n_components=2)
tsne = model.fit_transform(X=vgg_features_mnist)

In [None]:
# Scatter plot of the 2D t-SNE embedding of the VGG features, coloured by
# class label.
plt.figure(figsize=(8, 8))
# plt.get_cmap is used instead of cm.get_cmap: the latter was removed in
# Matplotlib 3.9 and is only in scope through a wildcard pylab import.
cmap = plt.get_cmap('jet', 10)  # 10 discrete colours, one per class
plt.scatter(x=tsne[:, 0], y=tsne[:, 1], c=y_train, s=35, cmap=cmap)
plt.title("t-sne on vgg16 features mnist")
plt.colorbar()
plt.show()

## Cifar10

### Data preparation

In [None]:
# Download CIFAR10. Only the training split is used in this notebook, so the
# test split is discarded immediately.
from keras.datasets import cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
del x_test, y_test

In [None]:
# Build a class-balanced subsample: for every label draw 100 distinct random
# image indices, then keep only those images and labels.
np.random.seed(seed=222)
n_classes = len(np.unique(y_train))
per_class_picks = [
    # [0] selects the row (image) indices returned by np.where.
    np.random.choice(np.where(y_train == label)[0], 100, replace=False)
    for label in range(n_classes)
]
idx = np.concatenate(per_class_picks)

x_train = x_train[idx]
y_train = y_train[idx]

In [None]:
# Human-readable CIFAR10 class names; labels[k] is the name shown for class id k.
labels = np.array([
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
])

In [None]:
# Sanity check of the subsample: image array shape, label array shape and
# the number of images per class.
for summary in (x_train.shape, y_train.shape, np.unique(y_train, return_counts=True)):
    print(summary)

In [None]:
# Show one randomly chosen example image per class, side by side in one row,
# each titled with its class name.
plt.figure(figsize=(20, 20))
n_classes = len(np.unique(y_train))
for class_id in range(n_classes):
    candidates = np.where(y_train == class_id)[0]
    pick = np.random.choice(candidates, 1)
    plt.subplot(1, 10, class_id + 1)
    # x_train[pick] carries a leading axis of length 1; show its single image.
    plt.imshow(x_train[pick][0])
    plt.title(labels[class_id])

##  Visualizing CIFAR10 based on raw image pixel features

### PCA on pixel values cifar10

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the images, each flattened to a vector of
# 32*32*3 pixel values.
pca = PCA(n_components=2)
pca.fit(X=x_train.reshape(len(x_train), 32 * 32 * 3))

In [None]:
# Scatter plot of the images in the plane of the two fitted principal
# components, coloured by class label.

# Project once and reuse both columns instead of running the same
# transform twice (once per axis).
projected = pca.transform(x_train.reshape((len(x_train), 32 * 32 * 3)))

plt.figure(figsize=(8, 8))
# plt.get_cmap is used instead of cm.get_cmap: the latter was removed in
# Matplotlib 3.9 and is only in scope through a wildcard pylab import.
cmap = plt.get_cmap('jet', 10)  # 10 discrete colours, one per class
plt.scatter(x=projected[:, 0],
            y=projected[:, 1],
            # flatten the labels to one value per image for colouring
            c=y_train.reshape(len(x_train)), s=35, cmap=cmap)
plt.title("PCA on raw pixelvalues cifar10")
plt.colorbar()
plt.show()

### t-sne on pixel values cifar10

In [None]:
from sklearn.manifold import TSNE

# Embed the flattened pixel vectors into 2D with t-SNE. This takes some
# time; random_state fixes the seed of the embedding.
model = TSNE(random_state=0, n_components=2)
tsne = model.fit_transform(X=x_train.reshape(len(x_train), 32 * 32 * 3))

In [None]:
#### doing the t-sne plot
#### your code here:
#########################



##  Visualizing CIFAR10 based VGG features

### Getting VGG-features for CIFAR10

In [None]:
# Downloading the data, if it does not exist
import urllib
import os
if not os.path.isfile('cifar_EMB_1000.npz'):
    urllib.request.urlretrieve(
    "https://www.dropbox.com/s/si287al91c1ls0d/cifar_EMB_1000.npz?dl=1",
    "cifar_EMB_1000.npz")
%ls -hl cifar_EMB_1000.npz
Data=np.load("cifar_EMB_1000.npz")
vgg_features_cifar = Data["arr_0"]

### PCA on vgg16 features cifar10

In [None]:
# Fit a 2-component PCA on the VGG16 feature vectors of the CIFAR10 subsample.
pca = PCA(n_components=2)
pca.fit(X=vgg_features_cifar)

In [None]:
#### doing the PCA plot
#### your code here:
#########################



### t-sne on vgg16 features cifar10

In [None]:
# Embed the CIFAR10 VGG16 feature vectors into 2D with t-SNE; random_state
# fixes the seed of the embedding.
model = TSNE(random_state=0, n_components=2)
tsne = model.fit_transform(X=vgg_features_cifar)

In [None]:
# Scatter plot of the 2D t-SNE embedding of the VGG features, coloured by
# class label.
plt.figure(figsize=(8, 8))
# plt.get_cmap is used instead of cm.get_cmap: the latter was removed in
# Matplotlib 3.9 and is only in scope through a wildcard pylab import.
cmap = plt.get_cmap('jet', 10)  # 10 discrete colours, one per class
# flatten the labels to one value per image for colouring
plt.scatter(x=tsne[:, 0], y=tsne[:, 1], c=y_train.reshape(len(x_train)), s=35, cmap=cmap)
plt.title("t-sne on vgg16 features cifar10")
plt.colorbar()
plt.show()