In [None]:
import cv2
import numpy as np
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.models import Model
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from glob import glob
import os
from sklearn.decomposition import PCA
import collections


In [None]:
# Load images
# Glob patterns for the person crops. `{HOME}` is a placeholder that the
# original code never substituted (the patterns were plain strings, so glob
# looked for a directory literally named "{HOME}"), and the same pattern was
# listed twice, which would have double-counted every match.
# NOTE(review): `{HOME}` is assumed to mean the current user's home directory;
# adjust the substitution below if it refers to something else.
crop_patterns = [
    '/home/utku/Masaüstü/BIL476/runs/detect/predict/crops/person/*.jpg',
    '{HOME}/runs/detect/predict/crops/person/*.jpg',
]
home_dir = os.path.expanduser('~')

candidate_paths = []
for pattern in crop_patterns:
    candidate_paths.extend(glob(pattern.format(HOME=home_dir)))
# De-duplicate overlapping matches and sort so runs are reproducible
# (glob returns files in arbitrary order).
candidate_paths = sorted(set(candidate_paths))

# Keep the raw BGR images alongside their paths: the plotting step receives
# `images` and indexes it by sample position, so it must hold one entry per
# path. Unreadable files are dropped from both lists to keep them aligned.
image_paths = []
images = []
for candidate in candidate_paths:
    raw_image = cv2.imread(candidate)
    if raw_image is None:
        print(f"Skipping unreadable image: {candidate}")
        continue
    image_paths.append(candidate)
    images.append(raw_image)
print(f"Found {len(image_paths)} images.")


In [None]:
# Preprocess images and handle errors
def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image from disk and prepare it for the VGG16 model.

    The file is read with OpenCV, resized to ``target_size``, converted from
    BGR to RGB and passed through VGG16's ``preprocess_input``.

    Args:
        image_path: Path of the image file to read.
        target_size: Size handed to ``cv2.resize``; defaults to (224, 224).

    Returns:
        The preprocessed image array, or ``None`` if any step failed (the
        error is printed rather than raised so one bad file does not abort
        the run).
    """
    prepared = None
    try:
        bgr = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if bgr is None:
            raise ValueError(f"Unable to load image: {image_path}")
        rgb = cv2.cvtColor(cv2.resize(bgr, target_size), cv2.COLOR_BGR2RGB)
        prepared = preprocess_input(rgb)
    except Exception as err:
        print(f"Error processing image {image_path}: {err}")
    return prepared

print("Preprocessing images...")
# Preprocess every path in order and keep only the images that succeeded
# (preprocess_image returns None on failure).
preprocessed_images = [
    prepared
    for prepared in map(preprocess_image, image_paths)
    if prepared is not None
]

# Ensure there are images to process
if not preprocessed_images:
    raise ValueError("No valid images found. Please check the image paths and formats.")

# Stack the per-image arrays into a single batch array for the model.
preprocessed_images = np.array(preprocessed_images)
print(f"Shape of preprocessed images: {preprocessed_images.shape}")

# Build the feature extractor: the full ImageNet-trained VGG16 (classifier
# head included), truncated at the output of its 'fc1' layer.
print("Loading VGG16 model...")
base_model = VGG16(include_top=True, weights='imagenet')
model = Model(
    inputs=base_model.input,
    outputs=base_model.get_layer(name='fc1').output,
)

# Extract features in smaller batches to handle memory issues
batch_size = 16
features = []
failed_batches = []
num_images = len(preprocessed_images)
# Ceiling division: the previous `len // batch_size + 1` over-counted by one
# whenever the image count was an exact multiple of the batch size.
total_batches = (num_images + batch_size - 1) // batch_size

print("Extracting features in batches...")
for batch_number, start in enumerate(range(0, num_images, batch_size), start=1):
    batch = preprocessed_images[start:start + batch_size]
    try:
        batch_features = model.predict(batch)
        features.append(batch_features)
        print(f"Processed batch {batch_number}/{total_batches}")
    except Exception as e:
        # Keep going so every failing batch is reported, but remember it.
        print(f"Error processing batch {batch_number}: {e}")
        failed_batches.append(batch_number)

# Silently dropping a batch would shift every later feature row (and hence
# every cluster label) relative to the image it belongs to, so stop with a
# clear error instead of continuing with misaligned data.
if failed_batches:
    raise RuntimeError(
        f"Feature extraction failed for batch(es) {failed_batches}; "
        "features would no longer line up with the input images."
    )
if not features:
    raise ValueError("No features were extracted: there were no images to process.")

features = np.concatenate(features, axis=0)
print(f"Shape of extracted features: {features.shape}")

# Reduce dimensionality for faster clustering
print("Performing PCA to reduce dimensionality...")
try:
    # PCA cannot keep more components than min(n_samples, n_features); with
    # fewer than 50 images a fixed n_components=50 always failed and silently
    # fell back to the full-size features.
    n_components = min(50, features.shape[0], features.shape[1])
    # Fixed seed so the (possibly randomized) solver gives repeatable output.
    pca = PCA(n_components=n_components, random_state=42)
    reduced_features = pca.fit_transform(features)
    print(f"Shape of reduced features: {reduced_features.shape}")
except Exception as e:
    print(f"Error during PCA transformation: {e}")
    reduced_features = features  # Fall back to original features if PCA fails

# Initial KMeans clustering
print("Performing initial KMeans clustering...")
try:
    # Start with a higher number of clusters, but never more clusters than
    # samples (KMeans rejects n_clusters > n_samples).
    n_clusters = min(10, len(reduced_features))
    # n_init and random_state are pinned so results are repeatable and do not
    # depend on the installed scikit-learn version's defaults.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(reduced_features)
    initial_labels = kmeans.labels_
    print("KMeans clustering completed successfully.")
except Exception as e:
    print(f"Error during KMeans clustering: {e}")
    # Fall back to a single cluster. Labels must be integers: float labels
    # break later label arithmetic and label-based indexing.
    initial_labels = np.zeros(len(reduced_features), dtype=int)

# Desired cluster size range as (lower bound, upper bound); these values are
# passed to adjust_cluster_sizes as min_size and max_size.
min_cluster_size, max_cluster_size = 3, 10

# Function to merge small clusters and split large clusters
def adjust_cluster_sizes(labels, features, min_size, max_size):
    """Merge undersized clusters into their nearest neighbour and split oversized ones.

    Merge phase: every cluster with fewer than ``min_size`` members is folded
    into the other cluster whose members have the smallest mean distance to
    it. Sizes are re-evaluated as merging proceeds, so a small cluster that
    absorbed another one is judged on its new size.

    Split phase: every cluster that (after merging) has more than ``max_size``
    members is re-clustered with KMeans into ``count // max_size + 1``
    sub-clusters, each receiving a fresh label above all existing ones.
    KMeans does not balance sizes, so a sub-cluster may still exceed
    ``max_size``.

    Args:
        labels: 1-D sequence of cluster labels, one per sample. Float labels
            are converted to integers.
        features: 2-D array-like of shape (n_samples, n_features), row-aligned
            with ``labels``.
        min_size: Clusters smaller than this are merged.
        max_size: Clusters larger than this are split; must be >= 1.

    Returns:
        A new integer numpy array of adjusted labels; inputs are not modified.

    Raises:
        ValueError: If ``max_size`` < 1 or ``labels`` and ``features`` differ
            in length.
    """
    print("Adjusting cluster sizes...")
    if max_size < 1:
        raise ValueError("max_size must be at least 1.")

    labels = np.asarray(labels)
    features = np.asarray(features)
    if labels.dtype.kind == 'f':
        # e.g. a fallback like np.zeros(n); labels are used for arithmetic
        # and indexing downstream, so keep them integral.
        labels = labels.astype(int)
    if len(labels) != len(features):
        raise ValueError("labels and features must have the same number of rows.")

    label_counts = collections.Counter(labels.tolist())
    print(f"Initial label counts: {label_counts}")

    new_labels = labels.copy()
    if new_labels.size == 0:
        return new_labels

    # Merge small clusters
    for label in list(label_counts):
        # Use the *current* assignment: earlier merges may have grown or
        # emptied this cluster.
        members = np.where(new_labels == label)[0]
        if members.size == 0 or members.size >= min_size:
            continue
        candidates = [lbl for lbl in np.unique(new_labels).tolist() if lbl != label]
        if not candidates:
            print(f"Cluster {label} is below the minimum size but there is no other cluster to merge into.")
            continue
        print(f"Merging small cluster {label} with {members.size} images.")
        # Mean distance from every sample to the members of the small cluster.
        distances = np.linalg.norm(
            features[:, None] - features[members][None, :], axis=2
        ).mean(axis=1)
        mean_distances = [distances[new_labels == lbl].mean() for lbl in candidates]
        # argmin yields a position in `candidates`; map it back to the label
        # (assigning the raw position as a label was the original bug).
        nearest_label = candidates[int(np.argmin(mean_distances))]
        new_labels[members] = nearest_label

    # Split large clusters, judged on post-merge sizes and membership.
    next_label = int(new_labels.max()) + 1
    for label, count in collections.Counter(new_labels.tolist()).items():
        if count <= max_size:
            continue
        print(f"Splitting large cluster {label} with {count} images.")
        cluster_indices = np.where(new_labels == label)[0]
        # Never request more sub-clusters than there are samples.
        n_sub_clusters = min(count // max_size + 1, count)
        sub_kmeans = KMeans(n_clusters=n_sub_clusters, n_init=10, random_state=0)
        sub_labels = sub_kmeans.fit_predict(features[cluster_indices])
        for sub_label in np.unique(sub_labels):
            new_labels[cluster_indices[sub_labels == sub_label]] = next_label
            next_label += 1

    return new_labels

# Rebalance the initial clustering so cluster sizes fall in the desired range.
adjusted_labels = adjust_cluster_sizes(
    labels=initial_labels,
    features=reduced_features,
    min_size=min_cluster_size,
    max_size=max_cluster_size,
)
print("Cluster size adjustment completed.")

# Function to plot clusters
def plot_clusters(images, labels, max_images_per_cluster=10):
    """Show the images of each cluster as one row of a grid.

    Each unique label gets one row; up to ``max_images_per_cluster`` of its
    images are drawn side by side. Rows are addressed by position rather than
    by label value, so labels need not be contiguous or start at 0.

    Args:
        images: Sequence of BGR images (as returned by ``cv2.imread``), one
            per sample, in the same order as ``labels``.
        labels: Sequence of cluster labels, one per image.
        max_images_per_cluster: Maximum number of images drawn per row.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length.
    """
    labels = np.asarray(labels)
    if len(images) != len(labels):
        raise ValueError(
            f"Got {len(images)} images but {len(labels)} labels; "
            "they must be aligned one-to-one."
        )

    unique_labels = np.unique(labels)
    if unique_labels.size == 0:
        print("No clusters to plot.")
        return

    groups = [np.where(labels == label)[0] for label in unique_labels]
    n_rows = len(groups)
    n_cols = max(1, min(max(len(group) for group in groups), max_images_per_cluster))

    # squeeze=False always yields a 2-D axes array, so no special-casing of
    # single-row or single-column figures is needed.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(2.5 * n_cols, 3 * n_rows), squeeze=False
    )

    for row, (label, cluster_indices) in enumerate(zip(unique_labels, groups)):
        for col in range(n_cols):
            cell = axes[row, col]
            cell.axis('off')
            if col < len(cluster_indices):
                # One image per cell; drawing them all on the same axes would
                # leave only the last one visible.
                cell.imshow(cv2.cvtColor(images[cluster_indices[col]], cv2.COLOR_BGR2RGB))
        axes[row, 0].set_title(
            f'Cluster {label} - {len(cluster_indices)} images', loc='left'
        )

    fig.tight_layout()
    plt.show()

print("Plotting adjusted clusters...")
# Render the clusters using the size-adjusted labels.
plot_clusters(images=images, labels=adjusted_labels)
print("Script completed successfully.")
