In [51]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [60]:
path = r"C:\Users\Thenu\Desktop\Image_clustering\flower_images\flower_images"
# Work from inside the image folder so the bare filenames collected below
# (and any other relative path used afterwards) resolve against it.
os.chdir(path)

# Names of every PNG file found directly inside `path`.
# os.scandir yields entries in arbitrary, filesystem-dependent order, so the
# names are sorted: the sample order then stays the same between runs, which
# keeps anything seeded that is computed from it reproducible.
with os.scandir(path) as files:
    flowers = sorted(
        entry.name
        for entry in files
        # is_file() skips a directory whose name happens to end in '.png'
        if entry.is_file() and entry.name.endswith('.png')
    )

In [61]:
# Read the first collected image, resized to 224x224, and convert the
# resulting PIL image to a numpy array in one step.
img = np.array(load_img(flowers[0], target_size=(224, 224)))

print(img.shape)


(224, 224, 3)


In [62]:
# Put a leading axis of length 1 in front of the image: (1, 224, 224, 3).
reshaped_img = np.reshape(img, (1, 224, 224, 3))
print(reshaped_img.shape)

(1, 224, 224, 3)


In [63]:
# Run the VGG16 `preprocess_input` helper on the single-image batch.
# NOTE: the name `x` is rebound later to the PCA-transformed features.
x = preprocess_input(reshaped_img)

In [64]:
# Build the complete VGG16 network, then wrap it in a new Model whose output
# is taken from the second-to-last layer instead of the final one.
full_vgg = VGG16()
model = Model(inputs=full_vgg.inputs, outputs=full_vgg.layers[-2].output)

In [65]:
# Record the current working directory.
# NOTE(review): `cwd` is not referenced again in the cells shown here.
cwd = os.getcwd()


In [66]:
# Create the feature extractor once, up front, so it can be handed to
# extract_features as an argument instead of being rebuilt per image.
# The wrapped model stops at VGG16's second-to-last layer.
vgg16_full = VGG16()
model = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)
def extract_features(file, model):
    """Return the features `model` predicts for a single image file.

    Parameters
    ----------
    file
        Path (or anything else `load_img` accepts) of the image to read.
    model
        Object exposing ``predict(batch)``; it is called with one
        preprocessed batch of shape (1, 224, 224, 3).

    Returns
    -------
    Whatever ``model.predict`` returns for that one-image batch.
    """
    # load the image resized to 224x224 and convert 'PIL.Image.Image' -> numpy
    img = np.array(load_img(file, target_size=(224, 224)))
    # reshape to (num_of_samples, dim 1, dim 2, channels) with a single sample
    batch = img.reshape(1, 224, 224, 3)
    # prepare the batch for the model
    batch = preprocess_input(batch)
    # `use_multiprocessing` only ever applied to generator / Sequence inputs
    # and is no longer accepted by `Model.predict` in Keras 3 (TypeError), so
    # it is not passed for this in-memory array.
    return model.predict(batch)

In [67]:
# filename -> feature array for every image that was processed successfully
data = {}

p=r"C:/Users/Thenu/Desktop/Image_clustering/flower_features.pkl"

# filename -> exception for every image whose extraction failed
failed = {}

# loop through each image in the dataset
for flower in flowers:
    # try to extract the features and update the dictionary
    try:
        data[flower] = extract_features(flower, model)
    # Only ordinary errors are handled here: a bare `except:` would also
    # swallow KeyboardInterrupt and hide the reason every image failed.
    except Exception as err:
        failed[flower] = err
        print(f"feature extraction failed for {flower}: {err!r}")
        # keep what has been extracted so far as a pickle file
        with open(p, 'wb') as pkl_file:
            pickle.dump(data, pkl_file)
          
 
# image names, in the insertion order of `data`
filenames = np.array([*data])

# the matching feature arrays, flattened to one 4096-long row per image
feat = np.array([*data.values()]).reshape(-1, 4096)


# Read flower_labels.csv and collect the distinct values of its 'label'
# column (set order is arbitrary; the values are gathered into a list).
df = pd.read_csv('flower_labels.csv')
label = df['label'].tolist()
unique_labels = [*set(label)]


In [68]:
# Fit a 20-component PCA (fixed random_state) on the feature matrix, then
# project that same matrix onto the fitted components.
pca = PCA(n_components=20, random_state=22).fit(feat)
x = pca.transform(feat)

In [69]:
# Display the shape of the PCA-transformed feature matrix.
x.shape

(210, 20)

In [70]:
# Ask KMeans for as many clusters as there are distinct labels, with a
# fixed random_state, and fit it on the PCA-reduced features.
n_clusters = len(unique_labels)
kmeans = KMeans(n_clusters=n_clusters, random_state=22)
kmeans.fit(x)

In [71]:
# Bucket the image filenames by the cluster KMeans assigned to each one:
# { cluster id: [image filenames] }
groups = {}
for image_name, cluster_id in zip(filenames, kmeans.labels_):
    # setdefault creates the empty list the first time a cluster id is seen
    groups.setdefault(cluster_id, []).append(image_name)

In [72]:
import shutil

# Folder the images are read from, and the folder that receives one
# sub-folder per cluster.
source_dir = "C:/Users/Thenu/Desktop/Image_clustering/flower_images/flower_images"
output_root = "C:/Users/Thenu/Desktop/Image_clustering"

for key, names in groups.items():
    # A key of -1 is filed under "Couldn't cluster"; every other key gets a
    # folder named after the cluster id.
    folder = "Couldn't cluster" if key == -1 else str(key)
    cluster_dir = output_root + "/" + folder
    # Create the destination once per cluster (not once per file); exist_ok
    # makes re-running the cell safe.
    os.makedirs(cluster_dir, exist_ok=True)
    for name in names:
        shutil.copyfile(source_dir + "/" + str(name),
                        cluster_dir + "/" + str(name))

