### Clustering using deep feature extraction

In [1]:
import numpy as np
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input as prpc_vgg_res
from keras.applications.xception import preprocess_input as prpc_xce_inc

from keras.models import Model
import keras.applications.vgg16 as vgg16
import keras.applications.vgg19 as vgg19
import keras.applications.resnet50 as res
import keras.applications.inception_v3 as inc
import keras.applications.xception as xce

import pickle
from glob import glob
import gc
import os

# This is the feature extraction module/class
class Feature_extractor:
	def __init__(self, dataset, cnn_architecture, layer):
		
		self.dataset_path = "D:/clus_data/Data/%s/" % dataset
		self.dataset_im_path = self.dataset_path + "Images/"
		print(self.dataset_im_path)
		self.dataset_feat_path = self.dataset_path + "Features/%s_%s" % (cnn_architecture, layer)

		ext_file = open( self.dataset_path + "extension.txt", "r")
		self.extension = ext_file.read()
		ext_file.close()
		
		self.n_files = len(glob(self.dataset_im_path + "*" + self.extension))
		print(self.n_files)
		self.network_name = cnn_architecture
		self.layer_name   = layer
		
		self.get_network_characteristics()
		print("Feature extractor: %s // %s" % (self.network_name, self.layer_name))

	def get_network_characteristics(self):
		if self.network_name == "vgg16":
			base_model = vgg16.VGG16(weights='imagenet')
			self.model = Model(inputs=base_model.input, outputs=base_model.get_layer(self.layer_name).output)
			self.tgt_size = (224, 224)
			self.prpc_fct = prpc_vgg_res
			
		elif self.network_name == "vgg19":
			base_model = vgg19.VGG19(weights='imagenet')
			self.model = Model(inputs=base_model.input, outputs=base_model.get_layer(self.layer_name).output)        
			self.tgt_size = (224, 224)
			self.prpc_fct = prpc_vgg_res
			
		elif self.network_name == "resnet":
			base_model = res.ResNet50(weights='imagenet')
			self.model = Model(inputs=base_model.input, outputs=base_model.get_layer(self.layer_name).output)        
			self.tgt_size = (224, 224)
			self.prpc_fct = prpc_vgg_res
			
		elif self.network_name == "xception":
			base_model = xce.Xception(weights='imagenet')
			self.model = Model(inputs=base_model.input, outputs=base_model.get_layer(self.layer_name).output)        
			self.tgt_size = (299, 299)
			self.prpc_fct = prpc_xce_inc
			
		elif self.network_name == "inception":
			base_model = inc.InceptionV3(weights='imagenet')
			self.model = Model(inputs=base_model.input, outputs=base_model.get_layer(self.layer_name).output)
			self.tgt_size = (299, 299)
			self.prpc_fct = prpc_xce_inc
			
		else:
			print("Error: possible network names:\n-'vgg16'\n-'vgg19'\n-'resnet'\n-'xception'\n-'inception'")
			
	def get_PIL_image(self, image_name):
		file_name = self.dataset_im_path + image_name + self.extension
		
		return image.load_img(file_name, target_size = self.tgt_size)
	
	def get_arr_image(self, image_name):
		pil_im = self.get_PIL_image(image_name)
	
		return image.img_to_array(pil_im)
	
	def get_prpc_image(self, image_name):
		arr_im = self.get_arr_image(image_name)
		arr_im = np.expand_dims(arr_im, axis = 0)
		
		return self.prpc_fct(arr_im)
	
	def extract(self, image_name):
		prpc_im = self.get_prpc_image(image_name)
		
		return np.ndarray.flatten(self.model.predict(prpc_im))

	def extract_and_save_features(self):
		if not os.path.exists(self.dataset_feat_path):
			os.makedirs(self.dataset_feat_path)
		else:
			return
		
		print("extracting and saving features ... ")
		
		gc.collect()
        
		for im in range(self.n_files):
			if im % 100 == 0:
				print("    %d/%d" % (im, self.n_files))
			features = self.extract(str(im))
			feat_file = open(self.dataset_feat_path + "/%s.p" % im, "wb")
			pickle.dump(features, feat_file)
			feat_file.close()

Using TensorFlow backend.


In [2]:
# utils for clustering evaluation

import numpy as np
from copy import deepcopy

def confusion_matrix(clusters, classes_gt):
	new_gt = deepcopy(classes_gt)
	l = list(set(classes_gt))
	for i in range(len(classes_gt)):
		for j in range(len(l)):
			if classes_gt[i] == l[j]:
				new_gt[i] = j
				
	conf_mat = np.zeros([len(set(clusters)), len(set(new_gt))])
	for i in range(len(clusters)):
		conf_mat[clusters[i], new_gt[i]] += 1

	return conf_mat

def purity(clusters, classes_gt):
	conf_mat = confusion_matrix(clusters, classes_gt)
	sum_clu  = np.max(conf_mat, axis = 1)
	sum_tot  = np.sum(sum_clu)

	pur = sum_tot / len(clusters)

	return pur

In [3]:
from glob import glob
import os
import pickle

import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, MiniBatchKMeans, MeanShift, AffinityPropagation, Birch, DBSCAN
from sklearn.metrics import normalized_mutual_info_score as nmi

# The clustering module
class Clusterer:
	def __init__(self, dataset, cnn_architecture, layer, clustering_algorithm, n_classes = 0):
		self.dataset_path = "D:/clus_data/Data/%s/" % dataset
		self.dataset_feat_path = self.dataset_path + "Features/%s_%s/" % (cnn_architecture, layer)

		self.n_files = len(glob(self.dataset_feat_path + "*.p"))
		
		self.get_n_classes(n_classes)
		print("Number of classes: %d" % self.n_classes)
		
		self.get_algorithm(clustering_algorithm)
		print("Algorithm: " + str(self.algorithm).split("(")[0])
		
		self.get_features()
		print("Features shape: " + str(self.features.shape))
			
	def get_n_classes(self, n_classes):
		if os.path.exists(self.dataset_path + "true_labels.txt"):
			true_lab_file = open(self.dataset_path + "true_labels.txt", "r")
			self.true_labels = [int(tl.rstrip("\n")) for tl in true_lab_file.readlines()]
			true_lab_file.close()

			self.n_classes = len(list(set(self.true_labels)))
		elif n_classes != 0:
			self.n_classes = n_classes
			self.true_labels = 0
		else:
			print("Error: %s folder must contain a true_labels.txt file OR n_classes must be a positive integer" % self.dataset_path)
			return
	
	def get_algorithm(self, clustering_algorithm):
		if clustering_algorithm == "kmeans":
			self.algorithm = KMeans(self.n_classes, precompute_distances=False)
		elif clustering_algorithm == "mb_kmeans":
			self.algorithm = MiniBatchKMeans(self.n_classes)
		elif clustering_algorithm == "affinity_prop":
			self.algorithm = AffinityPropagation()
		elif clustering_algorithm == "mean_shift":
			self.algorithm = MeanShift()
		elif clustering_algorithm == "agglomerative":
			self.algorithm = AgglomerativeClustering(self.n_classes)
		elif clustering_algorithm == "birch":
			self.algorithm = Birch(self.n_classes)
		elif clustering_algorithm == "dbscan":
			self.algorithm = DBSCAN()
		else:
			print("Error: This clustering algorithm is not available. Choose among the following options: 'kmeans', 'mb_kmeans', 'affinity_prop', 'mean_shift', 'agglomerative', 'birch', 'dbscan'")
		
	def get_features(self):
		self.features = []
		for i in range(self.n_files):
			file = open(self.dataset_feat_path + "%d.p" % i, "rb")
			self.features.append(pickle.load(file))
			file.close()
		self.features = np.array(self.features)
	
	def cluster(self):
		print("Clustering ...")
		self.predicted_labels = self.algorithm.fit_predict(self.features)
	
	def evaluate(self, metric):
		if self.true_labels == 0:
			print("Error: A true_labels.txt file is needed")
			return
		
		if metric == "nmi":
			print("NMI: %f" % nmi(self.true_labels, self.predicted_labels))
		elif metric == "purity":
			print("Purity: %f" % purity(self.true_labels, self.predicted_labels))
		elif metric == "both":
			print("NMI: %f" % nmi(self.true_labels, self.predicted_labels, average_method="arithmetic"))
			print("Purity: %f" % purity(self.true_labels, self.predicted_labels))
		else:
			print("Error: This metric is not available. Choose among the following options: 'nmi', 'purity', 'both'")

### Updated result: Inception and Xception features give an NMI of 51%, which is a significant improvement over 15% for VGG and 28% for Resnet50.

#### CIFAR-10 VGG16 MiniBatch K-means

In [6]:
# running on the cifar-10 dataset
dataset              = "cifar-10"
cnn_architecture     = "vgg16"
layer 				 = "fc2"
clustering_algorithm = "mb_kmeans"
metric				 = "both"

fe = Feature_extractor(dataset, cnn_architecture, layer)
fe.extract_and_save_features()
cl = Clusterer(dataset, cnn_architecture, layer, clustering_algorithm)
cl.cluster()
predicted_labels = cl.predicted_labels
print("Shape predicted labels: %s" % str(predicted_labels.shape))
cl.evaluate(metric)

D:/clus_data/Data/cifar-10/Images/
60000
Feature extractor: vgg16 // fc2
Number of classes: 10
Algorithm: MiniBatchKMeans
Features shape: (60000, 4096)
Clustering ...
Shape predicted labels: (60000,)
NMI: 0.158868
Purity: 0.294167


#### CIFAR-10 VGG-19 MiniBatch K-Means

In [5]:
# running on the cifar-10 dataset
dataset              = "cifar-10"
cnn_architecture     = "vgg19"
layer 				 = "fc2"
clustering_algorithm = "mb_kmeans"
metric				 = "both"

fe = Feature_extractor(dataset, cnn_architecture, layer)
fe.extract_and_save_features()
cl = Clusterer(dataset, cnn_architecture, layer, clustering_algorithm)
cl.cluster()
predicted_labels = cl.predicted_labels
print("Shape predicted labels: %s" % str(predicted_labels.shape))
print("------------------")
print("------------------")
print("Evaluation Results")
print("------------------")
cl.evaluate(metric)

D:/clus_data/Data/cifar-10/Images/
60000
Feature extractor: vgg19 // fc2
Number of classes: 10
Algorithm: MiniBatchKMeans
Features shape: (60000, 4096)
Clustering ...
Shape predicted labels: (60000,)
------------------
------------------
Evaluation Results
------------------
NMI: 0.143904
Purity: 0.297800


#### CIFAR-10 Resnet50 MiniBatch K-Means

In [7]:
# running on the cifar-10 dataset
dataset              = "cifar-10"
cnn_architecture     = "resnet"
layer 				 = "avg_pool"
clustering_algorithm = "mb_kmeans"
metric				 = "both"

fe = Feature_extractor(dataset, cnn_architecture, layer)
fe.extract_and_save_features()
cl = Clusterer(dataset, cnn_architecture, layer, clustering_algorithm)
cl.cluster()
predicted_labels = cl.predicted_labels
print("Shape predicted labels: %s" % str(predicted_labels.shape))
print("------------------")
print("------------------")
print("Evaluation Results")
print("------------------")
cl.evaluate(metric)

D:/clus_data/Data/cifar-10/Images/
60000
Feature extractor: resnet // avg_pool
Number of classes: 10
Algorithm: MiniBatchKMeans
Features shape: (60000, 2048)
Clustering ...
Shape predicted labels: (60000,)
------------------
------------------
Evaluation Results
------------------
NMI: 0.268808
Purity: 0.409367


#### CIFAR-10 Xception MiniBatch K-Means

In [8]:
# running on the cifar-10 dataset
dataset              = "cifar-10"
cnn_architecture     = "xception"
layer 				 = "avg_pool"
clustering_algorithm = "mb_kmeans"
metric				 = "both"

fe = Feature_extractor(dataset, cnn_architecture, layer)
fe.extract_and_save_features()
cl = Clusterer(dataset, cnn_architecture, layer, clustering_algorithm)
cl.cluster()
predicted_labels = cl.predicted_labels
print("Shape predicted labels: %s" % str(predicted_labels.shape))
print("------------------")
print("------------------")
print("Evaluation Results")
print("------------------")
cl.evaluate(metric)

D:/clus_data/Data/cifar-10/Images/
60000
Feature extractor: xception // avg_pool
Number of classes: 10
Algorithm: MiniBatchKMeans
Features shape: (60000, 2048)
Clustering ...
Shape predicted labels: (60000,)
------------------
------------------
Evaluation Results
------------------
NMI: 0.511780
Purity: 0.679817


#### CIFAR-10 Inception MiniBatch K-Means

In [9]:
# running on the cifar-10 dataset
dataset              = "cifar-10"
cnn_architecture     = "inception"
layer 				 = "avg_pool"
clustering_algorithm = "mb_kmeans"
metric				 = "both"

fe = Feature_extractor(dataset, cnn_architecture, layer)
fe.extract_and_save_features()
cl = Clusterer(dataset, cnn_architecture, layer, clustering_algorithm)
cl.cluster()
predicted_labels = cl.predicted_labels
print("Shape predicted labels: %s" % str(predicted_labels.shape))
print("------------------")
print("------------------")
print("Evaluation Results")
print("------------------")
cl.evaluate(metric)

D:/clus_data/Data/cifar-10/Images/
60000
Feature extractor: inception // avg_pool
Number of classes: 10
Algorithm: MiniBatchKMeans
Features shape: (60000, 2048)
Clustering ...
Shape predicted labels: (60000,)
------------------
------------------
Evaluation Results
------------------
NMI: 0.518607
Purity: 0.662533


#### Running their modeule on COIL-100 dataset (which they used in their paper), the result is very good. NMI is 91%!

In [5]:
dataset              = "coil-100"
cnn_architecture     = "vgg16"
layer 				 = "fc2"
clustering_algorithm = "mb_kmeans"
metric				 = "both"

fe2 = Feature_extractor(dataset, cnn_architecture, layer)
fe2.extract_and_save_features()
cl2 = Clusterer(dataset, cnn_architecture, layer, clustering_algorithm)
cl2.cluster()
predicted_labels = cl2.predicted_labels
print("Shape predicted labels: %s" % str(predicted_labels.shape))
cl2.evaluate(metric)

D:/clus_data/Data/coil-100/Images/
7200
Feature extractor: vgg16 // fc2
Number of classes: 100
Algorithm: MiniBatchKMeans
Features shape: (7200, 4096)
Clustering ...
Shape predicted labels: (7200,)
NMI: 0.910459
Purity: 0.842083


### Below is my code, where I tried to replicate their results.

In [1]:
from keras.models import Model
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img, load_img

model = VGG16(weights='imagenet')
feat_extract_model = Model(inputs=model.input, outputs=model.get_layer('fc2').output)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


In [2]:
data_gen = ImageDataGenerator(preprocessing_function=preprocess_input)

test_gen = data_gen.flow_from_directory('D:/clus_data/Data/coil-100/Images/',target_size=(224,224), color_mode="rgb", class_mode=None, shuffle=False, batch_size=36)

Found 7200 images belonging to 1 classes.


In [3]:
extract_feats = feat_extract_model.predict_generator(test_gen, steps=200, verbose=True)



In [4]:
extract_feats.shape

(7200, 4096)

In [5]:
from sklearn.cluster import MiniBatchKMeans

predictions = MiniBatchKMeans(n_clusters=100).fit_predict(extract_feats)

In [6]:
import os
if os.path.exists("D:/clus_data/Data/coil-100/true_labels.txt"):
    true_lab_file = open("D:/clus_data/Data/coil-100/true_labels.txt", "r")
    true_labels = [int(tl.rstrip("\n")) for tl in true_lab_file.readlines()]
    true_lab_file.close()

#### My approach gives an NMI of 77%, compared to their 91%.

In [9]:
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score

print("Normalized Mutual Info: ", normalized_mutual_info_score(true_labels, predictions,average_method="arithmetic"))
print("Purity: ", purity(true_labels, predictions))

Normalized Mutual Info:  0.7732649628986448
Purity:  0.6275
