In [1]:
#import basic libraries and setup dataset path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # image processing
#from PIL import Image # image processing
#import lodgepole.image_tools as lit # linear approximation of gamma correction KAGGLE WILL NOT IMPORT THIS
import os

# Directory that contains the dataset folder. Defaults to the Kaggle input mount;
# set the FACE_DATASET_ROOT environment variable to run on a different machine
# without editing the notebook.
path = os.environ.get('FACE_DATASET_ROOT', '/kaggle/input')

In [2]:
#randomly pick 10 distinct people of which there are at least 50 pictures

import random

people = [] # folder ids of the selected people, in the order they were picked
while len(people)<10:
    r = random.randint(1, 4000)
    if r in people:
        # Drawing the same person twice would put one identity under two
        # different ground-truth labels later on, so draw again.
        continue
    # os.walk yields nothing for a folder that does not exist; the default
    # makes such a draw count as "no pictures" instead of raising StopIteration.
    _, _, files = next(os.walk(path+'/11785-spring2021-hw2p2s1-face-classification/train_data/'+str(r)), (None, [], []))
    file_count = len(files)
    if file_count>=50:
        people.append(r)

In [3]:
#get 50 images of each person picked and put them in the training list

from skimage import color
from skimage import io
# skimage does the grayscale conversion because Kaggle will not import
# lodgepole.image_tools; the earlier approach was:
# color_img = np.asarray(Image.open(img_filename)) / 255
# gray_img = lit.rgb2gray_approx(color_img)

training = list() #flattened grayscale images, up to 50 consecutive ones per person, in the order the people were picked
for p in people:
    person_dir = path+'/11785-spring2021-hw2p2s1-face-classification/train_data/'+str(p)
    # Only files directly inside the person's folder are read: that is the same
    # set of files that was counted when the person was selected, and it avoids
    # walking the rest of the tree once the 50 images have been collected.
    _, _, filenames = next(os.walk(person_dir), (None, [], []))
    for filename in filenames[:50]:
        img = color.rgb2gray(io.imread(person_dir+'/'+filename))
        training.append(img.flatten())#images need to be flat (vectors and not arrays)

In [4]:
# Ground-truth labels used for evaluation: label 0 repeated 50 times, then
# label 1 repeated 50 times, ... up to label 9 (500 entries in total).
y_true = [person_index for person_index in range(10) for _ in range(50)]

In [5]:
#PCA implementation

from sklearn.decomposition import PCA

# One reduced copy of the training images per target dimensionality,
# stored in the order 25, 50, 100.
pca_reduced = []

for n_components in (25, 50, 100):
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(training)
    pca_reduced.append(projected)

In [6]:
#autoencoder implementation

from keras.layers import Input, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from numpy.random import seed

ac_reduced = list() # encoder outputs, one array per bottleneck size, in the order 25, 50, 100

# The image matrix does not depend on the bottleneck size, so it is built once.
sX = np.asarray(training)
ncol = sX.shape[1]
# There is no hold-out split: the same images are used to fit the model,
# as validation data, and as the input that gets encoded.
X_train=X_test=sX

for i in [25, 50, 100]:
    input_dim = Input(shape = (ncol, ))

    # DEFINE THE DIMENSION OF ENCODER ASSUMED i
    encoding_dim = i
    # DEFINE THE ENCODER LAYERS
    # Integer division: 4096/4 would pass the float 1024.0 as the layer width,
    # which Keras versions that validate `units` as an integer reject.
    encoded1 = Dense(4096//4, activation = 'relu')(input_dim)
    encoded2 = Dense(encoding_dim, activation = 'relu')(encoded1)
    # DEFINE THE DECODER LAYERS
    decoded1 = Dense(4096//4, activation = 'relu')(encoded2)
    decoded2 = Dense(ncol, activation = 'sigmoid')(decoded1)
    # COMBINE ENCODER AND DECODER INTO AN AUTOENCODER MODEL
    autoencoder = Model(inputs = input_dim, outputs = decoded2)
    # CONFIGURE AND TRAIN THE AUTOENCODER
    autoencoder.compile(optimizer = 'adadelta', loss = 'binary_crossentropy')
    # Train Auto Encoder
    autoencoder.fit(X_train, X_train, epochs = 10, batch_size = 10, shuffle = True, validation_data = (X_test, X_test))
    # Use Encoder level to reduce dimension of train and test data
    # (shares the trained layers with the autoencoder above)
    encoder = Model(inputs = input_dim, outputs = encoded2)
    # Predict the new data using Encoder
    encoded_out = encoder.predict(X_test)

    ac_reduced.append(encoded_out)

In [7]:
#purity function implementation

from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the purity of a clustering.

    Builds the contingency table of true labels against predicted cluster
    ids, takes the largest count in every column, and divides the sum of
    those maxima by the total number of counted samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    largest_per_column = table.max(axis=0)
    return largest_per_column.sum() / table.sum()

#f1 function implementation

def f1_measure(y_true , y_pred):
    """Return the micro-averaged F1 score of a clustering against true labels.

    Cluster ids are arbitrary: cluster 3 is not "class 3". Comparing raw ids
    with the true labels therefore scores a permutation of a perfect
    clustering as wrong. Before scoring, each cluster is matched to at most
    one true class (and vice versa) so that the total number of agreeing
    samples is maximal; predictions are then renamed to their matched class.

    Parameters
    ----------
    y_true : sequence of hashable, sortable labels, one per sample.
    y_pred : sequence of cluster ids, one per sample, same length as y_true.

    Returns
    -------
    float
        metrics.f1_score(..., average='micro') on the aligned labels.
    """
    from scipy.optimize import linear_sum_assignment

    # Work in index space so the "unmatched" marker below cannot collide
    # with a real label value.
    _, true_idx = np.unique(np.asarray(y_true).ravel(), return_inverse=True)
    _, pred_idx = np.unique(np.asarray(y_pred).ravel(), return_inverse=True)
    n_true = int(true_idx.max()) + 1
    n_pred = int(pred_idx.max()) + 1

    # overlap[c, t] = number of samples in cluster c whose true class is t
    overlap = np.zeros((n_pred, n_true), dtype=np.int64)
    np.add.at(overlap, (pred_idx, true_idx), 1)

    # linear_sum_assignment minimises cost, so negate to maximise agreement.
    matched_clusters, matched_classes = linear_sum_assignment(-overlap)

    # Clusters left without a partner (more clusters than classes) keep -1,
    # which never equals a true index and so always counts as an error.
    cluster_to_class = np.full(n_pred, -1, dtype=np.int64)
    cluster_to_class[matched_clusters] = matched_classes

    return metrics.f1_score(true_idx, cluster_to_class[pred_idx], average='micro')


In [8]:
#Agglomerative Clustering implementation

from sklearn.cluster import AgglomerativeClustering

# The distance keyword is left out on purpose: ward linkage only works with
# Euclidean distances (the default), and the old `affinity` keyword is not
# accepted by scikit-learn releases that renamed it to `metric`.
AC = AgglomerativeClustering(n_clusters = 10, linkage = 'ward')

print('Agglomerative Hierarchical Clustering')

# Same report for both reduction methods: cluster each reduced data set and
# print purity and F1 against the ground-truth labels.
for method_name, reduced_sets in (('PCA', pca_reduced), ('Autoencoder', ac_reduced)):
    print(method_name)
    for i,j in zip(reduced_sets, [25,50,100]):
        labels = AC.fit_predict(i)
        purity = purity_score(y_true,labels)
        f1 = f1_measure(y_true,labels)
        print('M='+str(j)+' purity='+str(purity)+' f1='+str(f1))

In [9]:
from scipy import spatial 
import math
K=10 # number of clusters K_means builds; matches the 10 people (classes) that were sampled


def euclidean_dist(value1, value2):
    """Return the norm (np.linalg.norm) of the difference value1 - value2."""
    offset = value1 - value2
    return np.linalg.norm(offset)


def cosine_dist(value1, value2):
    """Return the cosine distance between value1 and value2, as computed by
    scipy.spatial.distance.cosine."""
    cosine = spatial.distance.cosine
    return cosine(value1, value2)


def K_means(X, function, max_iterations, k=None):
    """Cluster the rows of X with k-means and return one cluster id per row.

    Each iteration assigns every sample to its nearest centroid and then
    moves every centroid to the mean of the samples assigned to it. The loop
    stops when an iteration leaves all assignments unchanged, or after
    max_iterations iterations.

    Parameters
    ----------
    X : array-like with one sample per row.
    function : "euclidean" to use euclidean_dist, any other string to use
        cosine_dist, or a callable taking two vectors and returning their
        distance.
    max_iterations : upper bound on the number of assign/update iterations.
    k : number of clusters; when omitted, the module-level K is used.

    Returns
    -------
    list of int
        y_pred[i] is the cluster id (0 .. k-1) of X[i].

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of samples.
    """
    if k is None:
        k = K

    if callable(function):
        distance = function
    elif function == "euclidean":
        distance = euclidean_dist
    else: # cosine distance
        distance = cosine_dist

    X = np.asarray(X, dtype=float)
    n_samples = len(X)
    if k < 1 or k > n_samples:
        raise ValueError("k must be between 1 and the number of samples (" + str(n_samples) + "), got " + str(k))

    # Initialise the centroids with k *distinct* samples. Drawing with
    # replacement can select the same sample twice, which produces identical
    # centroids and therefore clusters that can never receive a sample.
    # Indexing with a list copies the rows, so X itself is never modified.
    centroids = X[random.sample(range(n_samples), k)]

    def nearest(point):
        # Index of the centroid with the smallest distance to `point`.
        return int(np.argmin([distance(point, centroid) for centroid in centroids]))

    y_pred = None
    for _ in range(max_iterations):
        # Assignment step, rebuilt from scratch every iteration so that
        # memberships from earlier iterations do not linger.
        new_pred = [nearest(x) for x in X]
        if new_pred == y_pred:
            # Unchanged assignments mean unchanged centroids: converged.
            break
        y_pred = new_pred

        # Update step: the centroid is the mean of the member *vectors*
        # (not of their indices). A cluster that lost all its members keeps
        # its previous centroid instead of becoming NaN.
        assignment = np.asarray(y_pred)
        for j in range(k):
            members = X[assignment == j]
            if len(members) > 0:
                centroids[j] = members.mean(axis=0)

    if y_pred is None:
        # max_iterations <= 0: still return a valid labelling, based on the
        # initial centroids.
        y_pred = [nearest(x) for x in X]
    return y_pred

def report_k_means(reduced_sets, max_iterations=100):
    """Run K_means on every reduced data set with both distance functions and
    print purity and F1 against y_true.

    reduced_sets holds the reduced data for M = 25, 50, 100, in that order.
    """
    for heading, distance_name in (("Euclidean distance", "euclidean"), ("\nCosine distance", "cosine")):
        print(heading)
        for i,j in zip(reduced_sets, [25,50,100]):
            y_pred = K_means(i,distance_name,max_iterations)
            purity = purity_score(y_true,y_pred)
            f1 = f1_measure(y_true,y_pred)
            print('M='+str(j)+' purity='+str(purity)+' f1='+str(f1))


# K_means with PCA
print("PCA \n")
report_k_means(pca_reduced)

print("\n----------------------------------------------------------------")

# K_means with Autoencoder
# The autoencoder section must evaluate the encoder outputs (ac_reduced);
# evaluating pca_reduced here would only repeat the PCA experiment under
# the wrong heading.
print("Autoencoder \n")
report_k_means(ac_reduced)