# Feature Extraction

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import hdbscan

## Load Model

In [None]:
# Load VGG16 with ImageNet weights and without its top (classifier) layers
# (include_top=False); the result is used below as a feature extractor.
model = keras.applications.VGG16(include_top=False, weights='imagenet')

In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

import functools


@functools.lru_cache(maxsize=4)
def _intermediate_feature_model(model, layer_name):
    """Return a sub-model mapping ``model``'s input to ``layer_name``'s output.

    The result is cached per (model, layer_name) pair so the sub-model is
    built once instead of once per image. Building a fresh Keras model for
    every call is wasted work and gives each call a brand-new predict
    function.

    NOTE(review): relies on the Keras model object being hashable (identity
    hash) -- confirm for the Keras version in use.
    """
    return tf.keras.models.Model(
        inputs=model.input,
        outputs=model.get_layer(layer_name).output,
    )


def extract_vgg16_features(image_path, model, layer_name='block5_pool',
                           target_size=(224, 224)):
    """Extract the activations of one layer of ``model`` for a single image.

    Args:
        image_path: Path of the image file to load.
        model: Keras model to take the features from (a VGG16 instance in
            this notebook).
        layer_name: Name of the layer whose output is returned. Defaults to
            'block5_pool', the layer the original code hard-coded.
        target_size: (height, width) the image is resized to on load.
            Defaults to (224, 224).

    Returns:
        The array produced by ``predict`` for the requested layer. The input
        is expanded to a batch of one image, so the leading dimension is 1.
    """
    # Load and preprocess the image
    img = image.load_img(image_path, target_size=target_size)
    img = image.img_to_array(img)
    img = preprocess_input(img)
    # The model expects a batch axis, so wrap the single image in a batch of 1.
    img = np.expand_dims(img, axis=0)

    # Reuse the cached sub-model rather than rebuilding it on every call.
    feature_model = _intermediate_feature_model(model, layer_name)
    return feature_model.predict(img)

## Run Model

In [None]:
import os

img_dir = "/Users/kaavi/Documents/GitHub/term7cv/dataset/objects/train_8"

# Maps image file name -> feature array returned by extract_vgg16_features.
features_dict = {}

# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for img_name in sorted(os.listdir(img_dir)):
    img_path = os.path.join(img_dir, img_name)

    # os.listdir returns every entry, not just images. Skip sub-directories
    # (the copy step at the end of this notebook creates per-cluster folders
    # inside img_dir) and hidden files such as .DS_Store, which would
    # otherwise make image loading fail.
    if img_name.startswith(".") or not os.path.isfile(img_path):
        continue

    features_dict[img_name] = extract_vgg16_features(img_path, model)


## Flatten Features

In [None]:
# Split features_dict into two parallel sequences: the image file names and a
# 2-D array holding one flattened feature vector per image (same order).
filenames = list(features_dict)

# Flatten with -1 instead of a hard-coded 7*7*512 so this cell keeps working
# if a different layer or input size is used for feature extraction.
feature_vectors = np.array(
    [np.reshape(feature, -1) for feature in features_dict.values()]
)

## Normalize Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the feature matrix, then apply it to that same
# matrix to obtain the normalized features.
scaler = StandardScaler()
scaler.fit(feature_vectors)
normalized_features = scaler.transform(feature_vectors)

In [None]:
# Cluster the feature vectors with HDBSCAN and keep the per-image labels.
# NOTE(review): this runs on the raw feature_vectors; normalized_features is
# computed in the previous cell but not used here -- confirm which is intended.
clusterer = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=10)
clusterer.fit(feature_vectors)
cluster_labels = clusterer.labels_

# Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Use a very tall figure: the dendrogram is drawn sideways with one label
# (file name) per image.
fig = plt.figure()
fig.set_figheight(70)

# Build the hierarchical clustering (Ward linkage) over the feature vectors.
# NOTE(review): uses the raw feature_vectors, not normalized_features --
# confirm this is intended.
linkage_matrix = linkage(feature_vectors, method='ward')

# Draw the hierarchy, labelling each leaf with its image file name.
# Other dendrogram() parameters can be used to customize the plot.
dendrogram(linkage_matrix, orientation="left", labels=filenames)

plt.title("Hierarchical Clustering Dendrogram")
# Save before show() so the written file contains the rendered figure.
plt.savefig("test.png")
plt.show()

## Assigning Images to Clusters
TODO: find a way to determine `num_clusters` automatically instead of hard-coding it.

In [None]:
from scipy.cluster.hierarchy import fcluster

# Number of flat clusters to request; tune as needed.
num_clusters = 5

# Flatten the hierarchy into at most num_clusters clusters ('maxclust').
cluster_assignments = fcluster(
    linkage_matrix,
    num_clusters,
    criterion='maxclust',
)

# The cluster_assignments contain the cluster labels for each image

In [None]:
# Show how many assignments there are, then the assignments themselves,
# each on its own line.
print(len(cluster_assignments), cluster_assignments, sep="\n")

## Copying Images to Their Respective Cluster Folders

In [None]:
import shutil

# Copy every image into a sub-folder of img_dir named after its cluster label.
# NOTE(review): this uses cluster_labels (the HDBSCAN result), not the
# cluster_assignments computed from the dendrogram -- confirm which one is
# intended.
for img, cluster in zip(filenames, cluster_labels):

    cluster_dir = os.path.join(img_dir, str(cluster))
    img_path_orig = os.path.join(img_dir, img)
    img_path_copy = os.path.join(cluster_dir, img)

    # exist_ok=True replaces the separate exists() check: a single call with
    # no window between checking for and creating the directory.
    os.makedirs(cluster_dir, exist_ok=True)

    shutil.copy(img_path_orig, img_path_copy)
