In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
plt.rcParams['figure.figsize'] = (20, 20)

import os
import itertools
import numpy as np
import pandas as pd
from PIL import Image
from scipy.spatial.distance import cityblock, cosine, pdist, squareform

from tqdm import tqdm_notebook as tqdm

In [None]:
size = 50  # the sample is later laid out on a size x size grid
n_images = size ** 2
path_to_images = '../data/small_images/'

# Draw n_images distinct file names from the image directory.
image_ids = np.random.choice(os.listdir(path_to_images), n_images, replace=False)

# Image.open is lazy and keeps the file handle open, so each file is opened in
# a context manager and converted immediately. convert('RGB') returns a fully
# loaded three-band copy, which normalises grayscale, palette, alpha and other
# modes that the downstream three-channel histogram code cannot handle.
images = []
for image_id in tqdm(image_ids):
    with Image.open(os.path.join(path_to_images, image_id)) as image:
        images.append(image.convert('RGB'))

In [None]:
# Image.histogram() returns the per-band counts concatenated band after band
# (all of R, then all of G, then all of B), so the flat list must be split into
# three contiguous rows. reshape(-1, 3).T would instead interleave the bands.
histograms = [np.array(image.histogram()).reshape(3, -1)
              for image in images]

In [None]:
def moving_average(arr, n):
    """Return the simple moving average of `arr` over windows of length `n`.

    Parameters
    ----------
    arr : array-like
        Input values (flattened by `np.cumsum` if multi-dimensional).
    n : int
        Window length; must be at least 1.

    Returns
    -------
    numpy.ndarray
        One mean per full window, i.e. `len(arr) - n + 1` values, starting
        with the mean of `arr[0:n]`. Empty when `n` exceeds the input length.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError('window length n must be at least 1')
    # Prepend a zero so that cumsum[i + n] - cumsum[i] is exactly the sum of
    # arr[i:i + n]; without it the first window (and more) is silently lost.
    cumsum = np.concatenate(([0], np.cumsum(arr)))
    return (cumsum[n:] - cumsum[:-n]) / n


def smooth_histogram(hist, n=10):
    """Smooth each of the three channel histograms and join them end to end.

    `hist` must unpack into exactly three per-channel sequences. Each one is
    passed through `moving_average` with window length `n`, and the three
    results are concatenated, in order, into a single 1-D array.
    """
    red, green, blue = hist
    smoothed_channels = [moving_average(channel, n)
                         for channel in (red, green, blue)]
    return np.concatenate(smoothed_channels)

In [None]:
# One smoothed, flattened feature vector per image (default window length).
smooth_histograms = list(map(smooth_histogram, histograms))

In [None]:
# Look up the PIL image that belongs to a file name.
image_dict = dict(zip(image_ids, images))

# Pairwise cosine *distance* between the smoothed histograms: despite the
# variable name, smaller values mean more similar images (0 = same direction).
# pdist evaluates every unordered pair once in compiled code instead of making
# n**2 Python-level cosine() calls; squareform expands the result into the
# full symmetric matrix with an exactly-zero diagonal.
feature_matrix = np.vstack(smooth_histograms)
similarity = pd.DataFrame(data=squareform(pdist(feature_matrix, metric='cosine')),
                          index=image_ids,
                          columns=image_ids)

In [None]:
# Overview of the whole pairwise cosine-distance matrix; the trailing semicolon
# keeps the returned Axes repr out of the cell output.
sns.heatmap(data=similarity);

In [None]:
resolution = 100  # side length, in pixels, of every tile in the mosaic

height = resolution * size
width = resolution * size

# Allocate the canvas directly as uint8. np.empty(...).astype(np.uint8) first
# builds a float64 array eight times the size of the final canvas and then
# casts its uninitialised contents.
big_image = np.zeros((height, width, 3), dtype=np.uint8)
grid = np.array(list(itertools.product(range(size), range(size))))
sq_images = [image.resize((resolution, resolution)) for image in images]

# Walk the (row, column) grid in row-major order, pasting one tile per cell.
for pos, image in zip(grid, sq_images):
    block_t, block_l = pos * resolution
    block_b, block_r = (pos + 1) * resolution

    big_image[block_t:block_b, block_l:block_r] = np.array(image)

# Bare last expression: display the assembled mosaic.
Image.fromarray(big_image)

In [None]:
# Pick one of the sampled images at random to use as the query.
query_id = np.random.choice(image_ids)
# To pin a specific query instead, index image_ids directly, e.g.:
#query_id = image_ids[50*14 + 11]
# Bare last expression: look up and display the query image.
image_dict[query_id]

In [None]:
n_similar = 5     # how many nearest neighbours to show
thumb_size = 300  # side length, in pixels, of each thumbnail

# Remove the query by label before ranking. Skipping position 0 of the sorted
# column only works if the query is guaranteed to sort first, which ties or
# floating-point noise can break.
most_similar_ids = (similarity[query_id]
                    .drop(query_id)
                    .sort_values()
                    .index.values[:n_similar])
similar_images = [image_dict[image_id].resize((thumb_size, thumb_size))
                  for image_id in most_similar_ids]
# hstack already yields the side-by-side strip, so no hard-coded reshape is
# needed (it was a no-op for five RGB tiles and an error for anything else).
Image.fromarray(np.hstack([np.array(image) for image in similar_images]))