In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
plt.rcParams['figure.figsize'] = (20, 20)

import os
import itertools
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.cluster import KMeans
from skimage.color import rgb2lab, lab2rgb
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cosine

from tqdm import tqdm_notebook as tqdm

In [None]:
n_images = 200
path_to_images = '../data/small_images/'


def _open_as_rgb(path):
    """Open the image file at `path` and return a loaded 3-channel RGB copy."""
    # `convert('RGB')` normalises every PIL mode (greyscale, palette, RGBA,
    # CMYK, ...) to exactly three channels, which the later `.reshape(-1, 3)`
    # calls rely on. The context manager releases the file handle right away.
    with Image.open(path) as raw_image:
        return raw_image.convert('RGB')


# Sample `n_images` distinct file names from the image directory.
image_ids = np.random.choice(os.listdir(path_to_images), n_images, replace=False)
images = [_open_as_rgb(os.path.join(path_to_images, image_id))
          for image_id in tqdm(image_ids)]

In [None]:
# Flatten every image into an (n_pixels, 3) array and key the arrays by image id.
pixel_lists = [np.array(picture).reshape((-1, 3)) for picture in images]
image_dict = {image_id: pixels for image_id, pixels in zip(image_ids, pixel_lists)}

according to [wikipedia](https://en.wikipedia.org/wiki/Color_difference), this:

$${\displaystyle \Delta C={\sqrt {2\times \Delta R^{2}+4\times \Delta G^{2}+3\times \Delta B^{2}+{{{\bar {r}}\times (\Delta R^{2}-\Delta B^{2})} \over {256}}}}}$$

is a better colour distance than the raw Euclidean distance between RGB values. Here it is in Python:

In [None]:
def colour_distance(colour_1, colour_2):
    """Weighted RGB colour distance (the formula quoted in the markdown above).

    Parameters
    ----------
    colour_1, colour_2 : sequence of three numbers
        (r, g, b) components. Any numeric type is accepted, including the
        uint8 values read straight from image arrays.

    Returns
    -------
    float
        sqrt(2*dR^2 + 4*dG^2 + 3*dB^2 + mean_r * (dR^2 - dB^2) / 256)
    """
    # Cast to float before doing arithmetic: image pixels are uint8, and
    # unsigned subtraction/addition wraps around (e.g. 0 - 200 -> 56), which
    # silently produces wrong distances.
    r_1, g_1, b_1 = np.asarray(colour_1, dtype=float)
    r_2, g_2, b_2 = np.asarray(colour_2, dtype=float)

    d_r_sq = (r_1 - r_2) ** 2
    d_g_sq = (g_1 - g_2) ** 2
    d_b_sq = (b_1 - b_2) ** 2
    mean_r = (r_1 + r_2) / 2

    d_c_sq = (2 * d_r_sq +
              4 * d_g_sq +
              3 * d_b_sq +
              (mean_r * (d_r_sq - d_b_sq) / 256))

    return d_c_sq ** 0.5

In [None]:
# Width of one histogram bin along each RGB channel. 256 / 16 gives 16 bins per
# channel (4096 bins overall), matching the 16-bin histograms used further down.
step_size = 16
bins_per_channel = 256 // step_size

# One label per bin, e.g. '[3, 0, 15]'. The labels are built from plain Python
# ints so they are spelled exactly like the labels generated from pixels below.
bins = [str(list(bin_index))
        for bin_index in itertools.product(range(bins_per_channel), repeat=3)]


def _bin_histogram(pixels):
    """Count the rows of an (n_pixels, 3) array that fall into each colour bin.

    Assumes non-negative integer channel values (0..255) — TODO confirm.
    Returns a Series of counts indexed by bin label; empty bins are absent.
    """
    occupied, counts = np.unique(pixels // step_size, axis=0, return_counts=True)
    labels = [str([int(channel) for channel in row]) for row in occupied]
    return pd.Series(counts, index=labels)


# Build every column first and create the frame once; passing `index=bins`
# aligns each histogram to the full bin list, and `fillna(0)` fills empty bins.
bin_counts = (pd.DataFrame({image_id: _bin_histogram(pixels)
                            for image_id, pixels in tqdm(image_dict.items())},
                           index=bins)
              .fillna(0))

In [None]:
# Project the per-image bin-count vectors (one row per image) down to 2-D.
# NOTE(review): `UMAP` is not imported in the visible import cell — presumably
# `from umap import UMAP` (umap-learn); confirm and add it there.
embedding = UMAP().fit_transform(bin_counts.values.T)

plt.scatter(embedding[:, 0], embedding[:, 1]);

In [None]:
# From here on `image_dict` maps image id -> PIL image (it held pixel arrays before).
image_dict = {image_id: image for image_id, image in zip(image_ids, images)}

# Pairwise cosine *distance* between the images' bin-count columns; smaller
# values mean more similar histograms.
similarity = pd.DataFrame(
    data=[[cosine(bin_counts[column_id], bin_counts[row_id])
           for column_id in image_ids]
          for row_id in tqdm(image_ids)],
    index=image_ids,
    columns=image_ids,
)

In [None]:
# Heatmap of the pairwise cosine-distance matrix between the sampled images.
sns.heatmap(similarity);

In [None]:
# Pick a random query image and display it.
query_id = np.random.choice(image_ids)
image_dict[query_id]

In [None]:
resolution = 200
n_similar = 10

# Smallest distances first; position 0 is skipped (presumably the query itself
# at distance 0).
ranked_ids = similarity[query_id].sort_values().index.values
most_similar_ids = ranked_ids[1 : n_similar + 1]
similar_images = [image_dict[similar_id].resize((resolution, resolution))
                  for similar_id in most_similar_ids]

# Lay the thumbnails side by side in a single strip.
thumbnail_strip = np.hstack([np.array(thumbnail) for thumbnail in similar_images])
Image.fromarray(thumbnail_strip.reshape(resolution, n_similar * resolution, 3))

# start exploring CIELAB space instead

https://www.slideshare.net/slideshow/embed_code/key/K4j44642NfjNLS

- http://gfx.cs.princeton.edu/pubs/Chang_2015_PPR/chang2015-palette_small.pdf
- https://sci-hub.tw/https://epubs.siam.org/doi/10.1137/05064206X

- A deterministic seeding strategy that increases the distance between clusters before running k-means. **TODO:** not sure what to do about this yet; thesis p3.2 seems to detail it.



In [None]:
# Convert every image to CIELAB and flatten it to an (n_pixels, 3) array.
# `rgb2lab` is imported directly at the top of the notebook; the previous
# `color.rgb2lab` spelling referenced a `color` module that is never imported.
lab_images = [rgb2lab(np.array(image)).reshape(-1, 3) for image in tqdm(images)]

In [None]:
# Map image id -> flattened (n_pixels, 3) CIELAB pixel array.
lab_image_dict = dict(zip(image_ids, lab_images))

In [None]:
n_lab_bins = 16

# Nominal CIELAB channel ranges used to place the bin edges: L* in [0, 100] and
# a*/b* in [-128, 128]. a* and b* are signed, so they must be shifted before
# binning — plain division and truncation never lands in the 0..15 index.
lab_minimum = np.array([0.0, -128.0, -128.0])
lab_extent = np.array([100.0, 256.0, 256.0])

# One label per bin, e.g. '[3, 0, 15]'.
bins = [str(list(bin_index))
        for bin_index in itertools.product(range(n_lab_bins), repeat=3)]


def _lab_bin_histogram(lab_pixels):
    """Count the rows of an (n_pixels, 3) LAB array that fall into each bin.

    Values outside the nominal ranges are clipped into the first/last bin.
    Returns a Series of counts indexed by bin label; empty bins are absent.
    """
    scaled = (lab_pixels - lab_minimum) / lab_extent * n_lab_bins
    bin_indices = np.clip(np.floor(scaled), 0, n_lab_bins - 1).astype(int)
    occupied, counts = np.unique(bin_indices, axis=0, return_counts=True)
    labels = [str([int(channel) for channel in row]) for row in occupied]
    return pd.Series(counts, index=labels)


# Build every column first and create the frame once; passing `index=bins`
# aligns each histogram to the full bin list, and `fillna(0)` fills empty bins.
bin_counts = (pd.DataFrame({image_id: _lab_bin_histogram(lab_pixels)
                            for image_id, lab_pixels in tqdm(lab_image_dict.items())},
                           index=bins)
              .fillna(0))

In [None]:
# Project the per-image LAB bin-count vectors (one row per image) down to 2-D.
# NOTE(review): `UMAP` is not imported in the visible import cell — presumably
# `from umap import UMAP` (umap-learn); confirm and add it there.
embedding = UMAP().fit_transform(bin_counts.values.T)

plt.scatter(embedding[:, 0], embedding[:, 1]);

In [None]:
# `AgglomerativeClustering` is not part of the notebook's import cell, so it is
# imported here to keep this cell runnable on a fresh kernel.
from sklearn.cluster import AgglomerativeClustering

# Cluster the LAB pixels of the first image into five groups.
AgglomerativeClustering(n_clusters=5).fit_predict(lab_images[0])

### get `grid_centres` in rgb space

In [None]:
path_to_images = '../data/small_images/'
image_id = np.random.choice(os.listdir(path_to_images))

# `convert('RGB')` guarantees exactly three channels whatever the file's mode
# (greyscale, palette, RGBA, CMYK, ...); the cells below reshape to (-1, 3).
with Image.open(os.path.join(path_to_images, image_id)) as raw_image:
    image = raw_image.convert('RGB')

print(image_id.replace('.jpg', ''))

image

In [None]:
# Split the image into one flat array per channel. The pixel data is laid out
# as (height, width, channel), so it has to be flattened to (n_pixels, 3) and
# transposed; `reshape(3, -1)` would instead cut the interleaved values into
# three consecutive chunks that mix all channels.
r, g, b = np.array(image).reshape(-1, 3).T

In [None]:
# 16 x 16 x 16 colour histogram of the image. The range is [0, 256] so that
# every bin is exactly 16 wide (edges 0, 16, ..., 256) and the bin centres are
# `edge + 8`; 8-bit pixel values land in the same bins as with [0, 255].
hist, bins = np.histogramdd(np.array(image).reshape(-1, 3),
                            bins=16,
                            range=[[0, 256], [0, 256], [0, 256]])

bin_vector = hist.flatten()

keep track of the mean lab colour for each bin

In [None]:
from skimage.color import rgb2lab

In [None]:
# Centre of every histogram bin along one channel, taken from the bin edges.
bin_edges = bins[0]
bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2

# All 16**3 = 4096 combinations of channel centres, as RGB on a 0-255 scale.
centre_grid_rgb = np.array(list(itertools.product(bin_centres, repeat=3)))

# `rgb2lab` treats floating-point input as RGB in [0, 1], so the 0-255 values
# must be rescaled first; passing them unscaled yields meaningless LAB values.
mean_bin_lab = rgb2lab((centre_grid_rgb / 255).reshape(64, 64, 3))

Below is our unstacked colour space: the 4096 (16 × 16 × 16) RGB bin centres whose LAB equivalents are stored in `mean_bin_lab`.

In [None]:
# Render the 16**3 = 4096 bin colours (lower bin edge + 8 per channel) as a
# 16 x 256 RGB image.
Image.fromarray((np.array(list((itertools.product(bins[0][:-1], repeat=3)))) + 8).reshape(16, 256, 3).astype(np.uint8))

Instead of using each pixel's exact colour, we'll use the bin centres.

# solving the assignment problem
This doesn't need to be perfect - hopefully [CIE76](https://en.wikipedia.org/wiki/Color_difference#CIELAB_Delta_E*) (i.e. Euclidean distance in CIELAB space) is sufficient.

In [None]:
path_to_images = '../data/small_images/'
image_1 = Image.open(path_to_images + np.random.choice(os.listdir(path_to_images)))
image_2 = Image.open(path_to_images + np.random.choice(os.listdir(path_to_images)))

In [None]:
# Display the first randomly chosen image.
image_1

In [None]:
# Display the second randomly chosen image.
image_2

In [None]:
def get_palette(image, palette_size=5, image_size=100, random_state=None):
    """Extract a colour palette from an image by k-means clustering in CIELAB.

    Parameters
    ----------
    image : PIL.Image.Image
        Source image in any mode; it is converted to 3-channel RGB first.
    palette_size : int
        Number of clusters, i.e. colours in the returned palette.
    image_size : int
        The image is resized to `image_size` x `image_size` before clustering.
    random_state : int or None
        Forwarded to `KMeans`; pass an int to get a reproducible palette.

    Returns
    -------
    list of list of float
        `palette_size` cluster centres, each as [L, a, b].
    """
    # `convert('RGB')` covers greyscale as well as 4-channel/palette images,
    # which would otherwise break the `(-1, 3)` reshape below.
    image = image.convert('RGB').resize((image_size, image_size))
    lab_image = rgb2lab(np.array(image)).reshape(-1, 3)
    clusters = KMeans(n_clusters=palette_size, random_state=random_state).fit(lab_image)
    return [colour.tolist() for colour in clusters.cluster_centers_]


def display_palette(palette_colours, palette_size=None, image_size=100):
    """Render a palette of CIELAB colours as a horizontal strip of swatches.

    Parameters
    ----------
    palette_colours : sequence of [L, a, b]
        Colours to draw; lists, tuples and arrays are all accepted.
    palette_size : int or None
        Number of swatches. Defaults to `len(palette_colours)`; kept for
        backward compatibility and must equal that length when given.
    image_size : int
        Side length, in pixels, of each square swatch.

    Returns
    -------
    PIL.Image.Image
        RGB image of size (`image_size * palette_size`, `image_size`).

    Raises
    ------
    ValueError
        If `palette_size` does not match the number of colours.
    """
    colours = [np.asarray(colour, dtype=float) for colour in palette_colours]
    if palette_size is None:
        palette_size = len(colours)
    if palette_size != len(colours):
        raise ValueError('palette_size ({}) does not match the number of colours ({})'
                         .format(palette_size, len(colours)))

    # Fill one square per colour. `np.tile` works for any sequence type,
    # whereas `colour * n` only repeats plain Python lists.
    stretched_colours = [(lab2rgb(np.tile(colour, (image_size, image_size, 1))) * 255)
                         .astype(np.uint8)
                         for colour in colours]

    # Stacking horizontally already gives (image_size, image_size * n, 3).
    palette_array = np.hstack(stretched_colours)

    return Image.fromarray(palette_array)


def colour_distance(colour_1, colour_2):
    """Euclidean distance between two colours given as equal-length sequences.

    With CIELAB inputs this is the CIE76 colour difference. Note that this
    definition replaces the weighted-RGB `colour_distance` defined earlier in
    the notebook.
    """
    squared_total = 0
    for component_1, component_2 in zip(colour_1, colour_2):
        squared_total += (component_1 - component_2) ** 2
    return squared_total ** 0.5

In [None]:
# Extract and show the palette of the first image.
palette_1 = get_palette(image_1)
display_palette(palette_1)

In [None]:
# Extract and show the palette of the second image.
palette_2 = get_palette(image_2)
display_palette(palette_2)

In [None]:
# Cost matrix of Euclidean (CIE76) distances between palette colours: rows
# index palette_2, columns index palette_1. The distance must sum the *squared*
# component differences; squaring the sum of signed differences lets opposite
# differences cancel out and mis-ranks the pairs.
distances = [[colour_distance(c_1, c_2)
              for c_1 in palette_1]
             for c_2 in palette_2]

# Minimum-cost one-to-one matching; returns (row indices, column indices).
reorder_2, reorder_1 = linear_sum_assignment(distances)

In [None]:
# Show palette 1 in the order given by the assignment.
palette_1_reordered = [palette_1[i] for i in reorder_1]
display_palette(palette_1_reordered)

In [None]:
# Show palette 2 in the order given by the assignment.
palette_2_reordered = [palette_2[i] for i in reorder_2]
display_palette(palette_2_reordered)

# large scale palette similarity

In [None]:
# One LAB palette per loaded image, keyed by image id. Nothing earlier in the
# notebook defines `palettes`, so it is built here for the cells below.
palettes = {image_id: get_palette(image)
            for image_id, image in tqdm(zip(image_ids, images), total=len(image_ids))}

# Two randomly chosen palettes to experiment with.
palette_1, palette_2 = [palettes[image_id]
                        for image_id in np.random.choice(list(image_ids), 2)]

In [None]:
# NOTE(review): this repeats the previous cell's random draw; the parameters of
# the function defined below shadow these two names inside its body.
palette_1, palette_2 = [palettes[id] for id in np.random.choice(list(image_ids), 2)]

def palette_distance(palette_1, palette_2):
    """Distance between two palettes under the best one-to-one colour matching.

    Each colour of one palette is paired with a distinct colour of the other so
    that the summed Euclidean distance (CIE76 for CIELAB colours) is minimal.

    Parameters
    ----------
    palette_1, palette_2 : sequence of colours
        Each colour is a sequence of numeric components, e.g. [L, a, b]. The
        palettes may differ in length; the surplus colours of the longer one
        stay unmatched.

    Returns
    -------
    float
        Sum of the distances over the matched pairs.
    """
    colours_1 = np.atleast_2d(np.asarray(palette_1, dtype=float))
    colours_2 = np.atleast_2d(np.asarray(palette_2, dtype=float))

    # Pairwise Euclidean distances: rows index palette_1, columns palette_2.
    distances = np.linalg.norm(colours_1[:, None, :] - colours_2[None, :, :], axis=-1)

    # Use both index arrays: when palette_1 is the longer palette only a subset
    # of its rows is matched, so pairing by position would mis-align colours.
    rows, columns = linear_sum_assignment(distances)

    return float(distances[rows, columns].sum())

In [None]:
def _distances_from(palette):
    """Distance from `palette` to every stored palette, keyed by image id."""
    return {other_id: palette_distance(palette, other_palette)
            for other_id, other_palette in palettes.items()}


# Square table of palette distances: one column per image, one row per image.
palette_distances = pd.DataFrame({
    image_id: _distances_from(palette)
    for image_id, palette in tqdm(palettes.items())
})

In [None]:
# Heatmap of the pairwise palette-distance matrix.
sns.heatmap(palette_distances);

In [None]:
query_id = np.random.choice(image_ids)

# Rank all images by palette distance to the query; position 0 is skipped
# (presumably the query itself at distance 0).
ranked_by_distance = palette_distances[query_id].sort_values()
most_similar_palette = ranked_by_distance.index[1]
print(query_id, most_similar_palette)

In [None]:
# Show the stored palette that was actually used in the distance table.
# Re-running `get_palette` on the file would start k-means afresh and could
# produce different colours from the ones that were compared.
display_palette(palettes[query_id])

In [None]:
# Show the stored palette of the nearest neighbour, i.e. the colours that were
# actually compared, rather than re-clustering the image file.
display_palette(palettes[most_similar_palette])

# palette-based search

In [None]:
query_id = np.random.choice(image_ids)

resolution = 500
n_similar = 10

# Closest palettes first. Position 0 is kept here, so the strip normally
# starts with the query image itself.
most_similar_ids = palette_distances[query_id].sort_values().index.values[:n_similar]

# Look images up by id through `image_dict` (image id -> PIL image); `images`
# is a plain list and cannot be indexed with an id string.
similar_images = [image_dict[image_id].resize((resolution, resolution))
                  for image_id in most_similar_ids]

# Stacking horizontally already yields (resolution, n * resolution, 3), also
# when fewer than `n_similar` images are available.
Image.fromarray(np.hstack([np.array(image) for image in similar_images]))