# Color similarity extraction

This script computes the color similarity between each test instance and every training image, ranks the training images from most to least similar, and writes the results to `.csv` files.

In [1]:
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm

In [2]:
def _channel0_histogram(image_path):
    """Return the 256-bin histogram of channel 0 of the image at image_path.

    Only channel index 0 is histogrammed (``cv2.imread`` loads images in
    BGR order, so this is the blue channel).

    Raises:
        OSError: if OpenCV cannot read the image. ``cv2.imread`` signals a
            missing or undecodable file by returning ``None`` rather than
            raising, so the check is made explicitly here.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise OSError('Could not read image: ' + image_path)
    return cv2.calcHist([image], [0], None, [256], [0, 256])


train_xy = pd.read_csv('train.csv')
train_path = train_xy['id'].values
test_path = pd.read_csv('imagenames.csv')['id'].values

# Only store the histogram to save up space: the decoded images are
# discarded as soon as their histogram has been computed.
train_i = {f: _channel0_histogram('./train/' + f + '.jpg') for f in train_path}
test_i = {f: _channel0_histogram('./test/' + f + '.jpg') for f in test_path}

In [4]:
n_trains = len(train_xy)
n_tests = len(test_path)

# Preallocate the result tables (one row per test image, one column per
# rank) instead of growing them with np.append, which copies the whole
# array on every iteration. File names are kept in an object array so no
# implicit float -> string dtype promotion is involved.
fname = np.empty((n_tests, n_trains), dtype=object)
match = np.empty((n_tests, n_trains))

for row, test in enumerate(tqdm(test_path)):
    # Histogram-intersection score of this test image against every
    # training image; a larger score means a better match.
    matches = np.array([cv2.compareHist(train_i[train], test_i[test], cv2.HISTCMP_INTERSECT)
                        for train in train_path])
    # Sorting. The order starts from the best matches (highest score first).
    sorted_idx = np.argsort(matches)[::-1]

    fname[row] = train_path[sorted_idx]
    match[row] = matches[sorted_idx]

100%|██████████| 1200/1200 [04:45<00:00,  4.20it/s]


In [5]:
# Write the ranked training file names and their matching scores to two
# separate CSV files, without the DataFrame index column.
for csv_name, table in (
    ('color_matches_filename.csv', fname),
    ('color_matches_distance.csv', match),
):
    pd.DataFrame(table).to_csv(csv_name, index=False)