 # Pokemon Image Clustering #

In [1]:
# for loading/processing the images
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input
# for other things
import random
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
import os

pokemons = []
rootdir = '../data/PokemonData'
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        pokemons.append([file, subdir.split('\\')[-1]])
        
print(pokemons[:10])

[['0282b2f3a22745f1a436054ea15a0ae5.jpg', 'Abra'], ['06b9eec4827d4d49b1b4c284308708df.jpg', 'Abra'], ['10a9f06ec6524c66b779ea80354f8519.jpg', 'Abra'], ['1788abb8b51f48509cfac8067bd99e14.jpg', 'Abra'], ['28cfad92ad934d1f9b579cbff4b5d012.jpg', 'Abra'], ['2eb2a528f9a247358452b3c740df69a0.jpg', 'Abra'], ['2fd28e699b7c4208acd1637fbad5df2d.jpeg', 'Abra'], ['32240b108a8140f8b31c495166fc453c.jpg', 'Abra'], ['34532bb006714727ade4075f0a72b92d.jpg', 'Abra'], ['3680c3f65a484c3ba05a7cb93e1d7ae3.jpg', 'Abra']]


In [3]:
pokemon_labels = pd.DataFrame(pokemons, columns = ['FileName', 'Label'])
# credit to https://towardsdatascience.com/how-to-cluster-images-based-on-visual-similarity-cd6e7209fe34
#    for help with image processing
# load the image as a 224x224 array
import matplotlib.image as mpimg
import imghdr
img_dict = {}
for pokemon in range(len(pokemon_labels)):
    img_path = '../data/PokemonData/'+pokemon_labels.iloc[pokemon]['Label']+'/'+pokemon_labels.iloc[pokemon]['FileName']
    img_type = imghdr.what(img_path)
    if img_type != "png" and img_type != "jpg" and img_type != "jpeg":
        os.remove(img_path)
    else:
        img = load_img(img_path, target_size=(224,224))
        # convert from 'PIL.Image.Image' to numpy array
        img = np.array(img)
        reshaped_img = img.reshape(224,224,3)
        x = preprocess_input(reshaped_img)
        if pokemon_labels.iloc[pokemon]['Label'] in img_dict.keys(): img_dict[pokemon_labels.iloc[pokemon]['Label']].append(x)
        else : img_dict[pokemon_labels.iloc[pokemon]['Label']] = [x]

In [None]:
class img_K_means:
    def __init__(self,img_dict,K,dist_func):
        self.imgs       = img_dict #dictionary
        self.clustroids = np.asarray(random.sample(img_dict.keys(), K))
        self.centroids  = [224,224,3]*K
        self.distances  = np.zeros([K])
        self.clusters   = {}
        self.k = K
        self.dist_func  = dist_func
        for i in range(K): self.centroids[i] = random.choice(img_dict[self.clustroids[i]])
        for index,i in enumerate(self.clustroids): self.clusters[i] = [[i,self.centroids[index]]]
        
        
    def cluster(self):
        ctr=0
        for img_label in self.imgs.keys():
            for img_val in self.imgs[img_label]:        # For each data point
                for j, clustroid in enumerate(self.clustroids):                       # go through each clustroid
                    self.distances[j] = self.squared_dist(img_val,self.centroids[j])  # and calculate distance to clustroid.
                ctr+=1
                index = np.argmin(self.distances)                                     # Get the index of the closest clustroid
                if self.clustroids[index] in self.clusters.keys(): self.clusters[self.clustroids[index]].append([img_label,img_val])
                else: self.clusters[self.clustroids[index]] = [[img_label,img_val]]   #assign to corresponding cluster
                #for tuples in self.clusters[self.clustroids[index]]: print(tuples[0])
                #print("one point assigned: ", img_label, ctr)
                
    def compute_centroids(self):
        distances = np.full((6837),1000000) #start with large distances so min() doesn't grab 0's
        print(distances)
        new_clusters = {}
        for index,cluster in enumerate(self.clusters.keys()):
            centroid = np.zeros([224,224,3])
            for tuples in self.clusters[cluster]: 
                centroid += tuples[1]
            print(len(self.clusters[cluster]))
            self.centroids[index] = centroid/len(self.clusters[cluster])
            ctr=0
            clustroid = ""
            clustroid_val = np.zeros([224,224,3])
            for img_label, img_val_list in zip(self.imgs.keys(),self.imgs.values()): 
                #print(img_label)
                for img_val in img_val_list:
                    distances[ctr] = self.squared_dist(img_val,self.centroids[index])
                    if distances[ctr] == min(distances): 
                        print("img_label: ",img_label)
                        clustroid = img_label #gets name of pokemon of closest image to centroid
                        clustroid_val = img_val #value of closest image to centroid
                        #print(min(distances),distances[ctr],img_label,clustroid,img_val)
                    ctr+=1
            #print("new clustroid:",clustroid, "old clustroid:", cluster, "centroid:",self.centroids[index])
            new_clusters[clustroid]=[[clustroid,clustroid_val]] #if we get an error here, collision and thats not great
        print(self.clusters.keys(),new_clusters.keys())
        self.clusters = new_clusters
        
    def squared_dist(self,img1,img2):
        return np.sqrt(np.sum((img1-img2)**2))
    
    def fit(self):
        cloysters = self.clusters
        for i in range(1000):
            self.cluster()
            self.compute_centroids()
            if cloysters == self.clusters: break
            cloysters = self.clusters
            
first_try = img_K_means(img_dict,10,0)
first_try.fit()
                

            

[1000000 1000000 1000000 ... 1000000 1000000 1000000]
425
img_label:  Abra
img_label:  Abra
img_label:  Aerodactyl
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Chansey
img_label:  Cubone
img_label:  Golduck
4
1067
img_label:  Diglett
img_label:  Diglett
img_label:  Koffing
img_label:  Nidorina
390
img_label:  Kadabra
img_label:  Kangaskhan
img_label:  Mew
img_label:  Onix
img_label:  Weezing
270
img_label:  Diglett
99
img_label:  Doduo
img_label:  Tentacool
492
1528
img_label:  Chansey
img_label:  Chansey
img_label:  Cubone
img_label:  Exeggcute
939
1615
img_label:  Diglett
img_label:  Geodude
img_label:  Mew
dict_keys(['Seadra', 'Jynx', 'Golem', 'Squirtle', 'Pinsir', 'Tangela', 'MrMime', 'Snorlax', 'Fearow', 'Aerodactyl']) dict_keys(['Golduck', '', 'Nidorina', 'Weezing', 'Diglett', 'Tentacool', 'Exeggcute', 'Mew'])
[1000000 1000000 1000000 ... 1000000 1000000 1000000]
1
img_label:  Abra
img_label:  Abra

432
img_label:  Koffing
img_label:  Tentacool
883
img_label:  Mew
545
img_label:  Vulpix
773
868
1104
img_label:  Onix
img_label:  Weezing
img_label:  Weezing
415
img_label:  Golduck
dict_keys(['Golduck', 'Mew', 'Nidorina', 'Weezing', 'Diglett', 'Tentacool', 'Exeggcute', 'Vulpix', 'Geodude', 'Primeape', '', 'Rhydon', 'Koffing', 'MrMime', 'Pinsir', 'Fearow', 'Tangela', 'Jynx', 'Snorlax', 'Aerodactyl', 'Golem', 'Squirtle', 'Seadra']) dict_keys(['Golduck', 'Mew', 'Nidorina', 'Weezing', 'Diglett', 'Tentacool', 'Exeggcute', 'Vulpix', 'Geodude', 'Primeape', '', 'Rhydon', 'Koffing'])
[1000000 1000000 1000000 ... 1000000 1000000 1000000]
1
img_label:  Abra
img_label:  Abra
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Cubone
img_label:  Dewgong
img_label:  Golduck
1
img_label:  Mew
1
img_label:  Nidorina
1
img_label:  Weezing
img_label:  Weezing
1
img_label:  Diglett
1
img_label:  Koffing
img_label:  Tentacool
1
img_label:  Exeggcute
1
img_l

img_label:  Weezing
img_label:  Weezing
415
img_label:  Golduck
dict_keys(['Golduck', 'Mew', 'Nidorina', 'Weezing', 'Diglett', 'Tentacool', 'Exeggcute', 'Vulpix', 'Geodude', 'Primeape', '', 'Rhydon', 'Koffing', 'MrMime', 'Pinsir', 'Fearow', 'Tangela', 'Jynx', 'Snorlax', 'Aerodactyl', 'Golem', 'Squirtle', 'Seadra']) dict_keys(['Golduck', 'Mew', 'Nidorina', 'Weezing', 'Diglett', 'Tentacool', 'Exeggcute', 'Vulpix', 'Geodude', 'Primeape', '', 'Rhydon', 'Koffing'])
[1000000 1000000 1000000 ... 1000000 1000000 1000000]
1
img_label:  Abra
img_label:  Abra
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Cubone
img_label:  Dewgong
img_label:  Golduck
1
img_label:  Mew
1
img_label:  Nidorina
1
img_label:  Weezing
img_label:  Weezing
1
img_label:  Diglett
1
img_label:  Koffing
img_label:  Tentacool
1
img_label:  Exeggcute
1
img_label:  Vulpix
1
img_label:  Geodude
1
img_label:  Primeape
1
1
img_label:  Rhydon
1
img_label:  Koffing
692
507
600
img

[1000000 1000000 1000000 ... 1000000 1000000 1000000]
1
img_label:  Abra
img_label:  Abra
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Alolan Sandslash
img_label:  Cubone
img_label:  Dewgong
img_label:  Golduck
1
img_label:  Mew
1
img_label:  Nidorina
1
img_label:  Weezing
img_label:  Weezing
1
img_label:  Diglett
1
img_label:  Koffing
img_label:  Tentacool
1
img_label:  Exeggcute
1
img_label:  Vulpix
1
img_label:  Geodude
1
img_label:  Primeape
1
1
img_label:  Rhydon
1
img_label:  Koffing
692
507
600
img_label:  Ditto
img_label:  Geodude
432
img_label:  Koffing
img_label:  Tentacool
883
img_label:  Mew
545
img_label:  Vulpix
773
868
1104
img_label:  Onix
img_label:  Weezing
img_label:  Weezing
415
img_label:  Golduck
dict_keys(['Golduck', 'Mew', 'Nidorina', 'Weezing', 'Diglett', 'Tentacool', 'Exeggcute', 'Vulpix', 'Geodude', 'Primeape', '', 'Rhydon', 'Koffing', 'MrMime', 'Pinsir', 'Fearow', 'Tangela', 'Jynx', 'Snorlax', 'Aerodactyl', 'Golem', 'Squirtle', 'Sead

img_label:  Cubone
img_label:  Dewgong
img_label:  Golduck
1
img_label:  Mew
1
img_label:  Nidorina
1
img_label:  Weezing
img_label:  Weezing
1
img_label:  Diglett


In [None]:
print(first_try.clusters)