In [1]:
import numpy as np
import pandas as pd
import os
import cv2

import warnings
warnings.filterwarnings("ignore")

In [2]:
CROPPED_PATH =  'cp'


In [3]:
import torch 
from torch import nn
from PIL import Image
from torchvision import transforms

In [7]:
clean = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [9]:
import torchvision
resnet18=torchvision.models.resnet18(pretrained=True)
resnet18_ls=nn.Sequential(
    *(list(resnet18.children())[:-1])
)

In [10]:
def get_images_and_breeds(path):
    breeds= []
    images = []
    for i,name in enumerate(os.listdir(path)):
        p = os.path.join(path, name)
        files = os.listdir(p)
        imag = [file for file in files if file.lower().endswith(('.jpg'))]
        for image in imag:            
            image_path = os.path.join(p, image)
            im = Image.open(image_path).convert("RGB")
            im = clean(im)
            im = im.unsqueeze(0) # create a mini-batch as expected by the model
            im=resnet18_ls(im).squeeze().detach().numpy()
            images.append(im)
            breeds.append(i)
    return images,breeds

In [11]:
images,breeds=get_images_and_breeds(CROPPED_PATH)
images = np.array(images)
breeds  = np.array(breeds)

In [13]:
from sklearn.decomposition import PCA
pca_model=PCA(2)
pca_output=pca_model.fit_transform(images)

In [53]:
fowlkes_scores=[]
silhouette_scores=[]

In [54]:
from sklearn.cluster import KMeans,BisectingKMeans,SpectralClustering,AgglomerativeClustering
from sklearn.metrics import silhouette_score,fowlkes_mallows_score


kmeans = KMeans(n_clusters=4, random_state=42, init="random").fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,kmeans.labels_),'kmeans_random'))
silhouette_scores.append((silhouette_score(pca_output,kmeans.labels_),'kmeans_random'))

In [55]:
kmeans_2 = KMeans(n_clusters=4, init='k-means++', random_state=42).fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,kmeans_2.labels_),'kmeans_++'))
silhouette_scores.append((silhouette_score(pca_output,kmeans_2.labels_),'kmeans_++'))


In [56]:
Bikmeans=  BisectingKMeans(n_clusters=4, init='random', random_state=42).fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,Bikmeans.labels_),'bisecting'))
silhouette_scores.append((silhouette_score(pca_output,Bikmeans.labels_),'bisecting'))


In [57]:
speclus= SpectralClustering(n_clusters=4, random_state=42).fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,speclus.labels_),'spectral'))
silhouette_scores.append((silhouette_score(pca_output,speclus.labels_),'spectral'))


In [58]:
single = AgglomerativeClustering(n_clusters=4, linkage='single').fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,single.labels_),'single_link'))
silhouette_scores.append((silhouette_score(pca_output,single.labels_),'single_link'))


In [59]:
complete = AgglomerativeClustering(n_clusters=4, linkage='complete').fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,complete.labels_),'complete_link'))
silhouette_scores.append((silhouette_score(pca_output,complete.labels_),'complete_link'))


In [60]:
average = AgglomerativeClustering(n_clusters=4, linkage='average').fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,average.labels_),'average_link'))
silhouette_scores.append((silhouette_score(pca_output,average.labels_),'average_link'))


In [61]:
ward = AgglomerativeClustering(n_clusters=4, linkage='ward').fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,ward.labels_),'ward_link'))
silhouette_scores.append((silhouette_score(pca_output,ward.labels_),'ward_link'))


In [62]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.03, min_samples=15).fit(pca_output)
fowlkes_scores.append((fowlkes_mallows_score(breeds,dbscan.labels_),'dbscan'))
silhouette_scores.append((silhouette_score(pca_output,dbscan.labels_),'dbscan'))
clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
print(f"Number of clusters: {clusters}")

Number of clusters: 4


In [64]:
fowlkes_scores,silhouette_scores

([(0.3446264024179339, 'kmeans_random'),
  (0.3438466572371958, 'kmeans_++'),
  (0.34553442000855705, 'bisecting'),
  (0.38845112624769584, 'spectral'),
  (0.501976775386379, 'single_link'),
  (0.35491813006521516, 'complete_link'),
  (0.4608464826702657, 'average_link'),
  (0.35726030617945054, 'ward_link'),
  (0.3700139652859362, 'dbscan')],
 [(0.35777664, 'kmeans_random'),
  (0.3574599, 'kmeans_++'),
  (0.32976735, 'bisecting'),
  (0.33306286, 'spectral'),
  (0.26334122, 'single_link'),
  (0.25019923, 'complete_link'),
  (0.25283995, 'average_link'),
  (0.33225697, 'ward_link'),
  (-0.22010207, 'dbscan')])

Best to worst fowlkes_scorws of our models in order:

In [66]:
sorted(fowlkes_scores,key=lambda x:x[0],reverse=True)

[(0.501976775386379, 'single_link'),
 (0.4608464826702657, 'average_link'),
 (0.38845112624769584, 'spectral'),
 (0.3700139652859362, 'dbscan'),
 (0.35726030617945054, 'ward_link'),
 (0.35491813006521516, 'complete_link'),
 (0.34553442000855705, 'bisecting'),
 (0.3446264024179339, 'kmeans_random'),
 (0.3438466572371958, 'kmeans_++')]

Best to worst silhouette scores of our models in order :

In [67]:
sorted(silhouette_scores,key=lambda x:x[0],reverse=True)

[(0.35777664, 'kmeans_random'),
 (0.3574599, 'kmeans_++'),
 (0.33306286, 'spectral'),
 (0.33225697, 'ward_link'),
 (0.32976735, 'bisecting'),
 (0.26334122, 'single_link'),
 (0.25283995, 'average_link'),
 (0.25019923, 'complete_link'),
 (-0.22010207, 'dbscan')]

reference :
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    
    https://discuss.pytorch.org/t/nn-sequential-of-the-children-of-a-model-has-different-output-than-that-model/149283
    
    https://pytorch.org/hub/pytorch_vision_resnet/
    
    
    
    
    

