# Load Generic Logo Model

In [1]:
from ultralytics import YOLO

# Path of the weights file handed to the YOLO constructor.
weights_path = './generic_logo_model/detect/train/weights/best.pt'
model = YOLO(weights_path)

# Predict Logos in a Subset of OLM Images

In [3]:
import os
import imageio.v3 as iio
from PIL import Image

def convert_heic_to_jpeg(heic_path, output_path):
    """Convert a HEIC file to JPEG, decoding with imageio and encoding with Pillow.

    Args:
        heic_path: Path of the HEIC file to read.
        output_path: Path the JPEG file is written to.
    """
    img = iio.imread(heic_path)
    image = Image.fromarray(img)
    # JPEG has no alpha channel and Pillow refuses to write RGBA/LA images as
    # JPEG, so drop the alpha channel before saving.
    if image.mode in ('RGBA', 'LA'):
        image = image.convert('RGB')
    image.save(output_path, format="JPEG")

def process_images(input_dir, output_dir, conf=0.05, iou=0):
    """Run the logo detector on every image in ``input_dir`` and save the crops.

    HEIC files are first converted to JPEG (written next to the original in
    ``input_dir``), and the converted copy is what gets passed to the model.
    Entries whose extension is not .jpg/.jpeg/.png (compared case-insensitively)
    after that step are skipped.

    Relies on the module-level ``model`` and ``convert_heic_to_jpeg``.

    Args:
        input_dir: Directory whose entries are scanned (non-recursively).
        output_dir: Directory handed to ``result.save_crop`` for each result.
        conf: Confidence threshold forwarded to the model call.
        iou: IoU threshold forwarded to the model call.
    """
    supported_extensions = ('.jpg', '.jpeg', '.png')
    # Guards against running the model twice on the same file, e.g. when a
    # JPEG produced from a HEIC in an earlier run is also in the listing.
    already_processed = set()

    for image_filename in os.listdir(input_dir):
        input_path = os.path.join(input_dir, image_filename)
        stem, extension = os.path.splitext(image_filename)

        if extension.lower() == '.heic':
            # splitext swaps only the real extension, whatever its casing, so
            # the JPEG can never be written over the HEIC source.
            output_path = os.path.join(input_dir, stem + '.jpg')
            convert_heic_to_jpeg(input_path, output_path)
            input_path = output_path  # Update input path to the new JPEG

        # Test the path that will actually be fed to the model, so converted
        # HEIC files and upper-case extensions are not silently skipped.
        if not input_path.lower().endswith(supported_extensions):
            continue
        if input_path in already_processed:
            continue
        already_processed.add(input_path)

        results = model(input_path, conf=conf, iou=iou)
        for result in results:
            result.save_crop(output_dir)

# Run the detector over the source photos; crops are saved under the output directory.
input_directory = '/Users/nickjohnson/olm_data/olm_pics'
output_directory = '/Users/nickjohnson/olm_data/potential_logos'
process_images(input_dir=input_directory, output_dir=output_directory)


image 1/1 /Users/nickjohnson/olm_data/olm_pics/8zLshCYBLoQcIXG75mMeGQK7nqksGFlrKDnLVTXC.jpg: 640x480 (no detections), 684.9ms
Speed: 6.5ms preprocess, 684.9ms inference, 3.9ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/Cpr0lFqPpf6sjYhauo75ou6mQdKMh0gjfn4BRe8z.jpg: 480x640 15 logos, 628.9ms
Speed: 1.8ms preprocess, 628.9ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/xaCFtlYwoNEgiyxQQ5q2IO9pkSemZotZMiSlq535.jpg: 480x640 4 logos, 619.4ms
Speed: 1.9ms preprocess, 619.4ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/kq01BCrlHp27qkDXPolFbnr2xym2bMHMirkDmTiI.jpg: 640x480 2 logos, 643.7ms
Speed: 2.2ms preprocess, 643.7ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/2nACYO9cpOI7xH3LFMBh3ndOXfP0mATOWF6Q7BZj.jpg: 480x640 (no detections), 634.

Invalid SOS parameters for sequential JPEG


image 1/1 /Users/nickjohnson/olm_data/olm_pics/KBg1n4U5xwuq6f81nSxePJqNVtYrQdS1S3FoUoJu.jpeg: 640x480 (no detections), 642.9ms
Speed: 2.0ms preprocess, 642.9ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/f5LFImhV0q639gJu4f9ftyXhlzNRPoq6DvBUuWl7.jpg: 640x480 (no detections), 616.6ms
Speed: 1.7ms preprocess, 616.6ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/uq7cJrQ6NR9W37fqJrjBdXyMufYbYILEmS92kTgN.jpeg: 640x480 (no detections), 656.9ms
Speed: 1.8ms preprocess, 656.9ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/vNWSzqTyjnYC1aGPScxDSBxVL1UC2aZPOadYDOJ1.jpg: 480x640 3 logos, 600.2ms
Speed: 1.8ms preprocess, 600.2ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/3PVJOVspk55GWJ0w3b1txEuPWljodm24P1pzHre4.jpg: 480x640 1 log

Invalid SOS parameters for sequential JPEG


image 1/1 /Users/nickjohnson/olm_data/olm_pics/FCqVce2Yzrqj41IqA5zsuvM7SzbdEGUGAH2LAwhL.jpeg: 640x480 (no detections), 658.5ms
Speed: 2.2ms preprocess, 658.5ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/xOkZ1jG0K3VGClsEKpFuAuAxda9mZYbGDqYWee5B.jpeg: 640x480 1 logo, 606.5ms
Speed: 1.7ms preprocess, 606.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/fR5bJUzBoPvd6snlyHKgfqBS2pHWAdg1r4pG6le8.jpg: 640x480 2 logos, 627.8ms
Speed: 2.2ms preprocess, 627.8ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/E1640FE4-AEA8-4732-AC84-2EB1A8A3D953.jpg: 640x640 4 logos, 841.3ms
Speed: 2.1ms preprocess, 841.3ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /Users/nickjohnson/olm_data/olm_pics/JzScXHdInkoEDHPHEpctu0Jd7YsoQ2UU9Il8Dasd.jpeg: 480x640 2 logos, 624.4ms
Speed: 2

# Method 1 - Cosine Similarity

In [1]:
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
import numpy as np
import os
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from imgbeddings import imgbeddings

# Two embedding backends are instantiated once and reused for every image:
# the imgbeddings model and a headless VGG16 with average pooling.
ibed = imgbeddings()
vgg_model = VGG16(pooling='avg', include_top=False)

def load_image(img_path):
    """Open the image at ``img_path`` and return it converted to RGB mode."""
    opened = Image.open(img_path)
    return opened.convert('RGB')

def generate_embeddings(image_paths, ibed, vgg_model):
    """Build one combined embedding row per image and stack them into a 2-D array.

    For each path the image is loaded through ``load_image``, embedded with
    ``ibed.to_embeddings`` and with ``vgg_model.predict`` (after resizing to
    224x224 and applying ``preprocess_input``), and the two flattened vectors
    are concatenated, imgbeddings part first.

    Args:
        image_paths: Iterable of image file paths.
        ibed: Object exposing ``to_embeddings(image)``.
        vgg_model: Object exposing ``predict(batch)``.

    Returns:
        Array with one row per entry of ``image_paths``.
    """
    rows = []
    for path in image_paths:
        rgb = load_image(path)

        # imgbeddings branch
        ibed_vec = ibed.to_embeddings(rgb).flatten()

        # VGG16 branch: resize, add a leading batch axis, then preprocess
        resized = rgb.resize((224, 224))
        batch = np.expand_dims(np.array(resized), axis=0)
        batch = preprocess_input(batch)
        with torch.no_grad():
            vgg_vec = vgg_model.predict(batch).flatten()

        rows.append(np.concatenate([ibed_vec, vgg_vec]))

    return np.vstack(rows)

dataset_dir = '/Users/nickjohnson/olm_data/clustered_images/unbranded'

# Only entries whose name ends in '.jpg' are embedded.
image_paths = [
    os.path.join(dataset_dir, entry)
    for entry in os.listdir(dataset_dir)
    if entry.endswith('.jpg')
]

X = generate_embeddings(image_paths=image_paths, ibed=ibed, vgg_model=vgg_model)

# Pairwise cosine similarity between all embedding rows
cosine_sim_matrix = cosine_similarity(X)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [2]:
from sklearn.cluster import AgglomerativeClustering

# Average-linkage agglomerative clustering on cosine distance; the number of
# clusters is left open and determined by the 0.3 distance threshold.
clusters = AgglomerativeClustering(
    n_clusters=None,
    metric='cosine',
    distance_threshold=0.3,
    linkage='average',
).fit(X).labels_

# Size of every cluster, keyed by its label
unique, counts = np.unique(clusters, return_counts=True)
count_dict = dict(zip(unique, counts))

# Relabel every single-member cluster as -1
for cluster_id, members in count_dict.items():
    if members >= 2:
        continue
    clusters = np.where(clusters != cluster_id, clusters, -1)

In [184]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA

# Preprocessing with PCA. n_components is set to the largest value PCA allows
# for X (the smaller of its two dimensions).
pca = PCA(n_components=min(X.shape[0], X.shape[1]), random_state=22)
pca.fit(X)
x = pca.transform(X)

# Initial clustering with KMeans on the PCA-transformed embeddings.
# Seeded (same seed as the PCA step) so that re-running the cell reproduces
# the same partition instead of a different random initialisation each time.
kmeans = KMeans(n_clusters=4, n_init='auto', random_state=22)
kmeans.fit(x)
kmeans_labels = kmeans.labels_

# Calculate initial counts of items in each cluster
unique, counts = np.unique(kmeans_labels, return_counts=True)
count_dict = dict(zip(unique, counts))

# Initialize the final cluster array with the original KMeans labels
final_clusters = np.copy(kmeans_labels)

# Make a copy of the keys for safe iteration (count_dict gains keys inside the loop)
cluster_ids = list(count_dict.keys())

# Process each cluster to determine if it needs refining
for cluster_id in cluster_ids:
    count = count_dict[cluster_id]
    if count > 15:
        # Get the indices for current cluster items
        indices_to_refine = np.where(kmeans_labels == cluster_id)[0]

        # Apply Agglomerative Clustering to large clusters only.
        # Note: this step works on the original embeddings X, not the PCA output x.
        X_subset = X[indices_to_refine]
        agglo = AgglomerativeClustering(n_clusters=None, metric='cosine', distance_threshold=0.2, linkage='average').fit(X_subset)
        agglo_labels = agglo.labels_

        # Offset the new labels past the current maximum to keep them unique
        offset = np.max(final_clusters) + 1
        refined_labels = agglo_labels + offset
        final_clusters[indices_to_refine] = refined_labels

        # Update count_dict with new labels and counts.
        # The entry for the refined KMeans label itself is left in place with its old count.
        unique_refined, counts_refined = np.unique(refined_labels, return_counts=True)
        refined_dict = dict(zip(unique_refined, counts_refined))
        for key, val in refined_dict.items():
            count_dict[key] = val  # Update or add new counts

# Handle single-member clusters after all refinements: relabel them as -1
for cluster_id, count in list(count_dict.items()):
    if count < 2:
        final_clusters = np.where(final_clusters == cluster_id, -1, final_clusters)

# Recompute the count dictionary from the final labels to ensure accuracy
unique_final, counts_final = np.unique(final_clusters, return_counts=True)
final_count_dict = dict(zip(unique_final, counts_final))

In [3]:
import os
import shutil
import numpy as np

def organize_images_by_cluster(image_paths, clusters, base_dir='/Users/nickjohnson/olm_data/reclustered_images'):
    """Copy each image into a ``cluster_<label>`` folder under ``base_dir``.

    Args:
        image_paths: Paths of the images to copy.
        clusters: Cluster label for each image, paired positionally with
            ``image_paths``.
        base_dir: Directory under which the per-cluster folders are created.
    """
    # First pass: make sure every distinct label has its folder
    for label in np.unique(clusters):
        folder = os.path.join(base_dir, f'cluster_{label}')
        if os.path.exists(folder):
            print(f"Directory already exists: {folder}")
            continue
        os.makedirs(folder)
        print(f"Created directory: {folder}")

    # Second pass: copy every image into the folder of its label, keeping its file name
    for src, label in zip(image_paths, clusters):
        dst = os.path.join(base_dir, f'cluster_{label}', os.path.basename(src))
        shutil.copy(src, dst)
        print(f"Copied {src} to {dst}")

# Rebuild the list of '.jpg' files in dataset_dir
image_paths = [
    os.path.join(dataset_dir, entry)
    for entry in os.listdir(dataset_dir)
    if entry.endswith('.jpg')
]

# Sort the images into per-cluster folders using the labels in `clusters`
organize_images_by_cluster(image_paths, clusters)

Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_-1
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_0
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_1
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_2
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_3
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_4
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_5
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_6
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_7
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_8
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_9
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_10
Created directory: /Users/nickjohnson/olm_data/reclustered_images/cluster_11
Created d