In [1]:
import os
import sys
import yaml
import wandb
import random
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import time
from torch.utils.data import DataLoader
from PIL import Image
from sklearn import cluster

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%pip install wandb -qU
%matplotlib inline

# Get the current working directory
notebook_dir = notebook_dir = os.path.dirname(os.path.abspath("__file__"))  
project_dir = os.path.abspath(os.path.join(notebook_dir, '..')) 
if project_dir not in sys.path:
    sys.path.append(project_dir)

from src import (get_transforms, load_data, split_data, set_seeds, 
                 verify_splits, verify_data, plot_species_grid,
                 verify_loader_transforms)
from src.config import load_config
from src.data_utils import ImagesDataset
from src.models import build_resnet50_basic
from src.train import setup_training, evaluate, train

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Locate the YAML file relative to the notebook's location
notebook_dir = os.path.dirname(os.path.abspath("__file__"))

# You need to update this path to your new .yaml file
config_path = os.path.join(notebook_dir, "../configs/kevins_mach.yaml")

# Load the YAML file
with open(config_path, "r") as f:
    config = yaml.safe_load(f)

In [3]:
print(f"Running torch v: {torch.__version__}")
device = config["device"]
print(f"Running on {device}...")


Running torch v: 2.5.1
Running on cuda...


In [4]:
train_features, test_features, train_labels, species_labels = load_data()

In [22]:
output_dir = "../data/preprocessed/"

if(os.path.isdir(output_dir)):
    print("Directory found!...")
    os.makedirs("../data/preprocessed/cluster0/")
    os.makedirs("../data/preprocessed/cluster1/")
else:
    os.makedirs(output_dir)
    print("Making directory!...")
    os.makedirs("../data/preprocessed/cluster0/")
    os.makedirs("../data/preprocessed/cluster1/")




Directory found!...


In order to use sklearn, we need to transform the input images to numpy arrays. This cell might take a while.... (took about 4mins for me)

In [8]:
train_array_list = []
test_array_list = []

for path in train_features["filepath"]:
    try:
        im = Image.open(path).resize((224,224)).convert("RGB")

        train_array_list.append(np.array(im).flatten())
    except:
        print(f"Couldn't open or convert image at: {path}!...")


for path in test_features["filepath"]:
    try:
        im = Image.open(path).resize((224,224)).convert("RGB")
        test_array_list.append(np.array(im).flatten())
    except:
        print(f"Couldn't open or convert image at: {path}!...")
        

The following is going to be resource intensive - fitting this is not that easy.

In [9]:

clusterer = cluster.KMeans(n_clusters=config["preprocessor"]["cluster_count"], max_iter = config["preprocessor"]["iteration_count"])

print(train_array_list[0].shape)

train_array = np.array(train_array_list)
test_array = np.array(test_array_list)

clusterer.fit(train_array)


print(f"Cluster centers at: {clusterer.cluster_centers_}")


(150528,)
Cluster centers at: [[ 75.23558497  84.93859159  81.65156711 ... 194.8475955  196.70364463
  196.83395805]
 [156.03064067 156.13750317 155.9275766  ... 221.03874399 221.10762218
  221.06229425]]


In [29]:
test = test_features["filepath"][0]
print(test)

splitted = test.split("../data/givens/test_features\\")
print(splitted[1])

out = clusterer.predict(test_array[0].reshape(1,-1))
print(out[0])

../data/givens/test_features\ZJ016488.jpg
ZJ016488.jpg
0


  test = test_features["filepath"][0]


In [None]:
# Lets see how it did:

for idx, path in enumerate(test_features["filepath"]):
    
    pred = clusterer.predict(test_array[idx].reshape(1,-1))
    pred = pred[0]


    if(pred==0):
        new_path = output_dir + "cluster0/" + path.split("../data/givens/test_features\\")[1]
        im = Image.open(path)
        im.save(new_path)

    else:
        new_path = output_dir + "cluster1/" + path.split("../data/givens/test_features\\")[1]
        im = Image.open(path)
        im.save(new_path)


0
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
1
0
0
0
0
0
1
0
0
0
0
1
0
1
0
0
0
1
0
0
0
0
0
1
0
1
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
1
0
1
1
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
1
0
1
1
1
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
1
1
0
1
0
1
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
1
1
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
1
1
1
0
0
0
0
0
0
0
1
0
0
0
1
0
1
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
1
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
1
0
1
0
1
0
0
1
1
0
0
0
0
0
0
0
0
0
0
1
0
1
0
1
0
0
0
1
0
0
0
0
0
1
0
1
1
0
0
1
1
0
0
0
0
0
0
0
1
0
0
0
1
0
0
1
0
1
0
0
1
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
1
1
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
1
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
1
1
0
0
0
0
0
1
1
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
0
1
0
0
0
0
0
1
1
0
1
1
0
1
0
0
1
1
