 # Pokemon Image Clustering #

In [1]:
# for loading/processing the images
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input
# for other things
import random
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
import os

pokemons = []
rootdir = '../data/PokemonData'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        pokemons.append([file, subdir.split('\\')[-1]])
        
print(pokemons[:10])

[['0282b2f3a22745f1a436054ea15a0ae5.jpg', 'Abra'], ['06b9eec4827d4d49b1b4c284308708df.jpg', 'Abra'], ['10a9f06ec6524c66b779ea80354f8519.jpg', 'Abra'], ['1788abb8b51f48509cfac8067bd99e14.jpg', 'Abra'], ['28cfad92ad934d1f9b579cbff4b5d012.jpg', 'Abra'], ['2eb2a528f9a247358452b3c740df69a0.jpg', 'Abra'], ['2fd28e699b7c4208acd1637fbad5df2d.jpeg', 'Abra'], ['32240b108a8140f8b31c495166fc453c.jpg', 'Abra'], ['34532bb006714727ade4075f0a72b92d.jpg', 'Abra'], ['3680c3f65a484c3ba05a7cb93e1d7ae3.jpg', 'Abra']]


In [3]:
pokemon_labels = pd.DataFrame(pokemons, columns = ['FileName', 'Label'])
# credit to https://towardsdatascience.com/how-to-cluster-images-based-on-visual-similarity-cd6e7209fe34
#    for help with image processing
# load the image as a 224x224 array
import matplotlib.image as mpimg
import imghdr
img_list = []
for pokemon in range(len(pokemon_labels)):
    img_path = '../data/PokemonData/'+pokemon_labels.iloc[pokemon]['Label']+'/'+pokemon_labels.iloc[pokemon]['FileName']
    img_type = imghdr.what(img_path)
    if img_type != "png" and img_type != "jpg" and img_type != "jpeg":
        os.remove(img_path)
    else:
        img = load_img(img_path, target_size=(224,224))
        # convert from 'PIL.Image.Image' to numpy array
        img = np.array(img)
        reshaped_img = img.reshape(224,224,3)
        x = preprocess_input(reshaped_img)
        img_list.append([pokemon_labels.iloc[pokemon]['Label'],x])
        #if pokemon_labels.iloc[pokemon]['Label'] in img_dict.keys(): img_dict[pokemon_labels.iloc[pokemon]['Label']].append(np.abs(x))
        #else : img_dict[pokemon_labels.iloc[pokemon]['Label']] = [np.abs(x)]


In [4]:
class img_K_means:
    def __init__(self,img_list,K,dist_func):
        self.imgs       = img_list #dictionary
        self.clustroids = np.asarray(random.sample(self.imgs, K))
        self.centroids  = np.zeros([K,224,224,3])
        self.distances  = np.zeros([K])
        self.clusters   = {}
        self.k = K
        self.dist_func  = dist_func
        for img_tuple in self.clustroids: 
            self.clusters[img_tuple[0]] = [img_tuple]
        
        
    def cluster(self):
        ctr=0
        for img_tuple in self.imgs:      # For each data point
            for j, clustroid_tuple in enumerate(self.clustroids):                       # go through each clustroid
                self.distances[j] = self.squared_dist(clustroid_tuple[1],img_tuple[1])  # and calculate distance to clustroid.
            index = np.argmin(self.distances)                            # Get the index of the closest clustroid
            if self.clustroids[index][0] in self.clusters.keys(): self.clusters[self.clustroids[index][0]].append(img_tuple)
            else: self.clusters[self.clustroids[index][0]] = [img_tuple]   #assign to corresponding cluster

                
                #for tuples in self.clusters[self.clustroids[index]]: print(tuples[0])
                #print("one point assigned: ", img_label, ctr)
                
    def compute_centroids(self):
        new_clusters = {}
        for index,cluster in enumerate(self.clusters.keys()):
            if index>self.k: break #out of bounds
            centroid = np.zeros([224,224,3])
            for img_tuple in self.clusters[cluster]: 
                centroid += np.abs(img_tuple[1])
            self.centroids[index] = np.abs(centroid)/len(self.clusters[cluster]) #averages the image RGB's in the cluster list
            ctr=0
            clustroid_tuple=["",np.zeros([224,224,3])]
            distances = np.full((len(self.clusters[cluster])),1000000) #start with large distances so min() doesn't grab 0's
            for cluster_tuple in self.clusters[cluster]: 
                distances[ctr] = self.squared_dist(cluster_tuple[1],self.centroids[index])
                if distances[ctr] == min(distances): 
                    clustroid_tuple = cluster_tuple
                ctr+=1
            if clustroid_tuple[0] == "": #sometimes clustroid won't get filled by this point. Not sure why, so here's my caveman solution
                clustroid_tuple = random.sample(self.imgs,1)
        
            print("new clustroid:",clustroid_tuple[0])
            new_clusters[clustroid_tuple[0]] = [clustroid_tuple] #if we get an error here, collision and thats not great
            self.clustroids[index]  = clustroid_tuple
        print(self.k, "clusters:")
        for i in self.clusters.keys():
            print(i,len(self.clusters[i]))
        self.clusters.clear()
        self.clusters = new_clusters.copy()
        
    def squared_dist(self,img1,img2):
        return np.sqrt(np.sum((img1-img2)**2))
    
    def fit(self):
        cloysters = self.clusters.copy()
        for i in range(25):
            self.compute_centroids()
            self.cluster()
            if cloysters == self.clusters: break
            cloysters.clear()
            cloysters = self.clusters.copy()
            
                

In [None]:
Ks = [2,5,10,20,50,150]
for k in Ks:
    k_cluster = img_K_means(img_list,k,0)
    k_cluster.fit()
    for cluster in k_cluster.clusters.keys():
        print("clustroid:",cluster,"number of imgs in this cluster: ",len(k_cluster.clusters[cluster]))
        

  return array(a, dtype, copy=False, order=order)


new clustroid: Parasect
new clustroid: Rapidash
2 clusters:
Parasect 1
Rapidash 1
new clustroid: Rhydon
new clustroid: Clefable
2 clusters:
Parasect 4679
Rapidash 2142
new clustroid: Exeggcute
new clustroid: Cubone
2 clusters:
Rhydon 3999
Clefable 2822
new clustroid: Exeggcute
new clustroid: Pidgey
2 clusters:
Exeggcute 2473
Cubone 4348
new clustroid: Chansey
new clustroid: Onix
2 clusters:
Exeggcute 1911
Pidgey 4910
new clustroid: Chansey
new clustroid: Onix
2 clusters:
Chansey 1284
Onix 5537
clustroid: Chansey number of imgs in this cluster:  1284
clustroid: Onix number of imgs in this cluster:  5537
new clustroid: Kangaskhan
new clustroid: Pikachu
new clustroid: Caterpie
new clustroid: Articuno
new clustroid: Omastar
5 clusters:
Kangaskhan 1
Pikachu 1
Caterpie 1
Articuno 1
Omastar 1
new clustroid: Onix
new clustroid: Clefable
new clustroid: Oddish
new clustroid: Chansey
new clustroid: Exeggcute
5 clusters:
Kangaskhan 4397
Pikachu 447
Caterpie 751
Articuno 659
Omastar 570
new clustro

new clustroid: Chansey
new clustroid: Vulpix
new clustroid: Kabuto
new clustroid: Weezing
new clustroid: Meowth
new clustroid: Porygon
new clustroid: Gloom
20 clusters:
Kingler 2
Onix 704
Rhydon 369
Kadabra 512
Raichu 2
Pikachu 2
Kangaskhan 919
Mewtwo 2129
Venusaur 2
Charmander 2
Ditto 518
Exeggcute 7
Chansey 477
Vulpix 617
Nidoking 89
Weezing 440
Meowth 2
Porygon 28
Gloom 17
new clustroid: Kingler
new clustroid: Onix
new clustroid: Rhydon
new clustroid: Kadabra
new clustroid: Raichu
new clustroid: Pikachu
new clustroid: Kangaskhan
new clustroid: Wartortle
new clustroid: Venusaur
new clustroid: Charmander
new clustroid: Ditto
new clustroid: Exeggcute
new clustroid: Chansey
new clustroid: Vulpix
new clustroid: Graveler
new clustroid: Weezing
new clustroid: Meowth
new clustroid: Porygon
new clustroid: Gloom
20 clusters:
Kingler 2
Onix 772
Rhydon 377
Kadabra 507
Raichu 2
Pikachu 2
Kangaskhan 1245
Wartortle 1147
Venusaur 2
Charmander 2
Ditto 1137
Exeggcute 7
Chansey 477
Vulpix 616
Kabuto 7