# 0. Env

In [None]:
from mecab import MeCab
from base64 import b64decode
import cv2
import numpy as np

from datasets import load_dataset

import matplotlib.pyplot as plt

# 1. Color Histogram
- 참고: https://www.pinecone.io/learn/series/image-search/color-histograms/

## 1.1. Processing

In [None]:
# Load the train split of the image set, pinned to a specific revision.
data = load_dataset(
    'pinecone/image-set',
    revision='e7d39fc',
    split='train',
)

In [None]:
# Inspect the loaded dataset
data

In [None]:
# Inspect the first record of the dataset
data[0]

In [None]:
# Decode the first record's base64 payload and view it as a flat uint8 buffer
# (this is still encoded image data, not pixels).
raw_payload = b64decode(data[0]['image_bytes'])
image_bytes = np.frombuffer(raw_payload, dtype=np.uint8)
image_bytes.shape

In [None]:
# Decode the encoded byte buffer into a color pixel array (named BGR here)
bgr_image = cv2.imdecode(image_bytes, cv2.IMREAD_COLOR)
bgr_image.shape

In [None]:
# Show the decoded array as-is. OpenCV decodes to BGR channel order while
# matplotlib expects RGB, so the colors look swapped in this plot.
plt.imshow(bgr_image)

In [None]:
# BGR (OpenCV) -> RGB (matplotlib): reverse the order of the channel axis
rgb_image = bgr_image[:, :, ::-1]
plt.imshow(rgb_image)

In [None]:
# Image conversion function
def process_fn(sample):
    """Decode one dataset record into a BGR pixel array.

    Args:
        sample: Mapping with an ``'image_bytes'`` entry holding the
            base64-encoded bytes of an encoded image.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        ValueError: If OpenCV cannot decode the payload. ``cv2.imdecode``
            signals failure by returning ``None`` rather than raising, which
            would otherwise surface later as an unrelated error.
    """
    encoded = np.frombuffer(b64decode(sample['image_bytes']), np.uint8)
    bgr_image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if bgr_image is None:
        raise ValueError("cv2.imdecode could not decode sample['image_bytes']")
    return bgr_image

In [None]:
# Decode every record in the dataset into a pixel array
images = list(map(process_fn, data))

In [None]:
# Preview one decoded image, with the channel axis reversed for matplotlib
bgr_image = images[2]
rgb_image = bgr_image[:, :, ::-1]
plt.imshow(rgb_image)
plt.show()

## 1.2. Gray Tutorial

In [None]:
# Convert the first image to grayscale and display it
gray_image = cv2.cvtColor(images[0], cv2.COLOR_BGR2GRAY)
plt.imshow(gray_image, cmap='gray')  # render with the gray colormap
plt.axis('off')  # hide the axes around the image
plt.show()

In [None]:
# Shape of the grayscale array
gray_image.shape

In [None]:
# Grayscale intensity histogram with 256 bins over [0, 256)
hist, _ = np.histogram(gray_image, bins=256, range=(0, 256))
plt.plot(hist, 'r')
plt.show()

In [None]:
# Grayscale intensity histogram with 128 bins over [0, 256)
hist, _ = np.histogram(gray_image, bins=128, range=(0, 256))
plt.plot(hist, 'g')
plt.show()

In [None]:
# Grayscale intensity histogram with 64 bins over [0, 256)
hist, _ = np.histogram(gray_image, bins=64, range=(0, 256))
plt.plot(hist, 'b')
plt.show()

## 1.3. Color Tutorial

In [None]:
# Convert the first image from BGR to RGB and display it
rgb_image = cv2.cvtColor(images[0], cv2.COLOR_BGR2RGB)
plt.imshow(rgb_image)
plt.axis('off')  # hide the axes around the image
plt.show()

In [None]:
# Shape of the RGB array
rgb_image.shape

In [None]:
# Split the RGB array into its red, green and blue channel planes
red_image, green_image, blue_image = (
    rgb_image[:, :, channel] for channel in (0, 1, 2)
)

red_image.shape, green_image.shape, blue_image.shape

In [None]:
# Per-channel histograms with 256 bins, drawn on one plot
red_hist, _ = np.histogram(red_image, bins=256, range=(0, 256))
green_hist, _ = np.histogram(green_image, bins=256, range=(0, 256))
blue_hist, _ = np.histogram(blue_image, bins=256, range=(0, 256))
for counts, line_color in ((red_hist, 'r'), (green_hist, 'g'), (blue_hist, 'b')):
    plt.plot(counts, line_color)
plt.show()

In [None]:
# Per-channel histograms with 128 bins, drawn on one plot
red_hist, _ = np.histogram(red_image, bins=128, range=(0, 256))
green_hist, _ = np.histogram(green_image, bins=128, range=(0, 256))
blue_hist, _ = np.histogram(blue_image, bins=128, range=(0, 256))
for counts, line_color in ((red_hist, 'r'), (green_hist, 'g'), (blue_hist, 'b')):
    plt.plot(counts, line_color)
plt.show()

In [None]:
# Per-channel histograms with 64 bins, drawn on one plot
red_hist, _ = np.histogram(red_image, bins=64, range=(0, 256))
green_hist, _ = np.histogram(green_image, bins=64, range=(0, 256))
blue_hist, _ = np.histogram(blue_image, bins=64, range=(0, 256))
for counts, line_color in ((red_hist, 'r'), (green_hist, 'g'), (blue_hist, 'b')):
    plt.plot(counts, line_color)
plt.show()

In [None]:
# 64-bin histograms of each channel, joined end to end (R, then G, then B)
# into a single vector
red_hist, _ = np.histogram(red_image, bins=64, range=(0, 256))
green_hist, _ = np.histogram(green_image, bins=64, range=(0, 256))
blue_hist, _ = np.histogram(blue_image, bins=64, range=(0, 256))
hist = np.concatenate((red_hist, green_hist, blue_hist))
plt.plot(hist, 'k')
plt.show()

## 1.4. Histogram Search

In [None]:
def get_histogram_vector(bgr_image, bins=256):
    """Build a color-histogram feature vector from a BGR image.

    Args:
        bgr_image: Array indexed as ``[row, col, channel]`` whose last axis
            holds the blue, green and red planes at positions 0, 1 and 2.
        bins: Number of histogram bins per channel over the range [0, 256].

    Returns:
        A 1-D array of length ``3 * bins``: the red histogram, then the
        green histogram, then the blue histogram.
    """
    # Channel positions in the BGR array, listed in R, G, B output order.
    rgb_order = (2, 1, 0)
    per_channel = [
        np.histogram(bgr_image[:, :, channel], bins, [0, 256])[0]
        for channel in rgb_order
    ]
    return np.concatenate(per_channel, axis=0)

In [None]:
# Image at index 0: show the picture, then its 64-bin histogram vector
sample_image = images[0]
plt.imshow(cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB))
plt.show()

hist = get_histogram_vector(sample_image, bins=64)
plt.plot(hist, 'k')
plt.show()

In [None]:
# Image at index 1: show the picture, then its 64-bin histogram vector
sample_image = images[1]
plt.imshow(cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB))
plt.show()

hist = get_histogram_vector(sample_image, bins=64)
plt.plot(hist, 'k')
plt.show()

In [None]:
# Image at index 2: show the picture, then its 64-bin histogram vector
sample_image = images[2]
plt.imshow(cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB))
plt.show()

hist = get_histogram_vector(sample_image, bins=64)
plt.plot(hist, 'k')
plt.show()

In [None]:
# Image at index 5: show the picture, then its 64-bin histogram vector
sample_image = images[5]
plt.imshow(cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB))
plt.show()

hist = get_histogram_vector(sample_image, bins=64)
plt.plot(hist, 'k')
plt.show()

In [None]:
# Turn every image into its 64-bin-per-channel histogram vector
image_vectors = [get_histogram_vector(image, bins=64) for image in images]

In [None]:
# Cosine similarity comparison
def cosine_sim(a, b):
    """Return the cosine similarity of two vectors.

    Both inputs are converted to float64 before any arithmetic. ``np.dot``
    keeps the input dtype, so small integer dtypes such as uint8 would wrap
    around in the dot product while ``np.linalg.norm`` computes in floating
    point, producing a wrong ratio.

    Args:
        a: First vector (array-like of real numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN).
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [None]:
# Vector to search with
idx = 0
query_vector = image_vectors[idx]

# Cosine similarity of the query against every stored vector
scores = np.array(
    [cosine_sim(query_vector, vector) for vector in image_vectors]
)
scores

In [None]:
# Image indices ordered from highest score to lowest
# (ascending argsort applied to the negated scores)
rank = np.argsort(np.negative(scores))
rank

In [None]:
# Show the query image, then the best-scoring matches with their histograms.
top_n = 5  # number of top results to display

print("query image")
query_image = np.flip(images[idx], 2)  # reverse channel axis for matplotlib
plt.imshow(query_image)
plt.show()

# `rank` holds image indices sorted by descending score, so the position
# within `rank` is the rank and the value is the image index. The label
# reports both instead of printing the image index as if it were the rank.
for position, image_idx in enumerate(rank[:top_n], start=1):
    print(f"rank {position} (image {image_idx}): {scores[image_idx]}")
    # Use a separate name so the query image is not overwritten by results.
    result_image = np.flip(images[image_idx], 2)
    plt.imshow(result_image)
    plt.show()

    # Overlay the query histogram and the matched image's histogram.
    plt.plot(query_vector, 'r', label='query')
    plt.plot(image_vectors[image_idx], 'g', label='image')
    plt.legend()
    plt.show()