In [68]:
import numpy as np
import cv2
from matplotlib import pyplot as plt
import os
import cyvlfeat as vlfeat
from sklearn.svm import LinearSVC, SVC
import os.path as osp
from skimage import filters
from skimage.feature import corner_peaks
from skimage.io import imread
import pickle
from random import shuffle
from scipy.spatial.distance import cdist

from operator import add
from functools import reduce


%matplotlib inline
plt.rcParams['figure.figsize'] = (20.0, 16.0)
plt.rcParams['image.interpolation'] = 'nearest'
# plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def im2single(im):
    """Return a float32 copy of `im` with every value divided by 255."""
    as_float = im.astype(np.float32)
    return as_float / 255

def single2im(im):
    """
    Scale an image array by 255 and convert it to uint8.

    :param im: numpy array; for the usual use case a float image in [0, 1].
    :return: a new uint8 array. The fractional part is truncated by the cast,
          not rounded.

    The input is left untouched: the scaling is done on a fresh array instead
    of in place, so callers can keep using their float image afterwards.
    """
    scaled = im * 255
    return scaled.astype(np.uint8)

def load_image(path):
    """
    Read an image from disk as a float32 array with the channel axis reversed.

    :param path: path of the image file, passed to ``cv2.imread``.
    :return: ``im2single`` of the decoded image with its last axis reversed
          (OpenCV decodes colour images as BGR, so the result is RGB).
    :raises OSError: if OpenCV could not read the file.
    """
    bgr = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if bgr is None:
        raise OSError('Could not read image (missing or unreadable): {}'.format(path))
    return im2single(bgr)[:, :, ::-1]

def load_image_gray(path):
    """Load the image at `path` with `load_image` and convert it from RGB to grayscale."""
    return cv2.cvtColor(load_image(path), cv2.COLOR_RGB2GRAY)

def bags_of_sifts_spm(image_paths, vocab_filename, depth=3):
    """
    Bags of sifts with spatial pyramid matching.

    :param image_paths: paths to N images
    :param vocab_filename: Path to the precomputed vocabulary.
          This function assumes that vocab_filename exists and contains an
          vocab_size x 128 ndarray 'vocab' where each row is a kmeans centroid
          or visual word. This ndarray is saved to disk rather than passed in
          as a parameter to avoid recomputing the vocabulary every run.
    :param depth: Number of pyramid levels (>= 1). Level 0 is the whole image;
          every further level splits each region of the previous level into
          four quadrants with `split`. Refer to the original paper
          (Lazebnik et al. 2006.) for more details.

    :return image_feats: N x d matrix, where d equals the number of clusters
          (vocab_size) times the number of regions in all pyramid levels,
          i.e. vocab_size * sum(4**l for l in range(depth)); 21 regions
          (1+4+16) for the default depth of 3.
    :raises ValueError: if depth is smaller than 1.

    Each region histogram is first L2-normalised (all-zero histograms are left
    as they are) and only then multiplied by its level weight, so the weight
    actually shows up in the feature vector.
    """
    if depth < 1:
        raise ValueError('depth must be at least 1, got {}'.format(depth))

    # NOTE: pickle can execute arbitrary code; only load vocabulary files you created.
    with open(vocab_filename, 'rb') as f:
        vocab = pickle.load(f)

    vocab_size = vocab.shape[0]
    # One bin per visual word, whatever size the stored vocabulary has.
    bin_edges = np.arange(vocab_size + 1)

    # Pyramid weights: 1/2^L for level 0 and 1/2^(L-l+1) for level l >= 1,
    # with L = depth - 1. For depth=3 this gives [0.25, 0.25, 0.5].
    finest = depth - 1
    weights = [2.0 ** -finest]
    weights += [2.0 ** (level - finest - 1) for level in range(1, depth)]

    feats = []

    for path in image_paths:
        img = load_image_gray(path)

        histogram = []
        regions = [img]

        for level in range(depth):
            if level > 0:
                # Refine every region of the previous level into its quadrants.
                regions = [quadrant for region in regions for quadrant in split(region)]

            for region in regions:
                _, descriptors = vlfeat.sift.dsift(region, step=5, fast=True)
                dist = cdist(vocab, descriptors, 'euclidean')
                nearest_word = np.argmin(dist, axis=0)
                hist, _ = np.histogram(nearest_word, bin_edges)

                # Guard on this region's own histogram so an empty region
                # never triggers a division by zero.
                norm = np.linalg.norm(hist)
                if norm != 0:
                    hist = hist / norm

                histogram.extend(hist * weights[level])

        feats.append(histogram)

    return np.array(feats)


def split(arr):
    """
    Cut a 2-D array into four quadrants.

    The rows are halved first, then each half is halved along its columns.
    Returns a list ordered top-left, top-right, bottom-left, bottom-right.
    """
    upper, lower = np.array_split(arr, 2)
    return np.array_split(upper, 2, axis=1) + np.array_split(lower, 2, axis=1)

In [47]:
def bags_of_sifts(image_paths, vocab_filename):
    """
    Encode each image as an L2-normalised histogram of visual words.

    :param image_paths: paths of the N images to encode.
    :param vocab_filename: pickle file holding the vocabulary array (one
          visual word per row).
    :return: N x len(vocab) array; row i is the histogram of image i.
          A histogram with zero norm is stored unnormalised.
    """
    with open(vocab_filename, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    n_words = len(vocab)
    bin_edges = range(n_words + 1)
    feats = np.zeros((len(image_paths), n_words))

    for row, path in enumerate(image_paths):
        gray = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(gray, step=5, fast=True)

        # Assign every descriptor to its closest visual word and count the votes.
        nearest_word = np.argmin(cdist(vocab, descriptors, 'euclidean'), axis=0)
        counts = np.histogram(nearest_word, bin_edges)[0]

        norm = np.linalg.norm(counts)
        feats[row, :] = counts if norm == 0 else counts / norm

    return feats


def bags_of_sifts_image(image, vocab_filename):
    """
    Encode one already-loaded image as an L2-normalised visual-word histogram.

    :param image: image array handed directly to ``vlfeat.sift.dsift``.
    :param vocab_filename: pickle file holding the vocabulary array (one
          visual word per row).
    :return: 1 x len(vocab) array. A histogram with zero norm is stored
          unnormalised.
    """
    with open(vocab_filename, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    n_words = len(vocab)
    feats = np.zeros((1, n_words))

    _, descriptors = vlfeat.sift.dsift(image, step=5, fast=True)

    # Assign every descriptor to its closest visual word and count the votes.
    nearest_word = np.argmin(cdist(vocab, descriptors, 'euclidean'), axis=0)
    counts = np.histogram(nearest_word, range(n_words + 1))[0]

    norm = np.linalg.norm(counts)
    feats[0, :] = counts if norm == 0 else counts / norm

    return feats

In [12]:
def build_vocabulary(image_paths, vocab_size):
    """
    Cluster sampled dense-SIFT descriptors into a visual-word vocabulary.

    :param image_paths: paths of the images to sample descriptors from.
    :param vocab_size: number of k-means clusters (visual words) to compute.
    :return: the cluster centres returned by ``vlfeat.kmeans.kmeans``.
    :raises ValueError: if no descriptor could be extracted from any image.

    Up to ``samples_per_image`` distinct descriptors are drawn at random from
    every image. Images that yield no descriptors are skipped. The draw uses
    numpy's global random state, so seed it beforehand for reproducible
    vocabularies.
    """
    step_size = 5
    samples_per_image = 20

    features = []

    for path in image_paths:
        img = load_image_gray(path)
        _, descriptors = vlfeat.sift.dsift(img, step=step_size, fast=True)

        n_descriptors = descriptors.shape[0]
        if n_descriptors == 0:
            # Nothing to sample from; np.random.choice would raise on an empty range.
            continue

        # Sample without replacement so the same descriptor is never picked twice.
        n_samples = min(samples_per_image, n_descriptors)
        chosen = np.random.choice(n_descriptors, n_samples, replace=False)
        features.append(descriptors[chosen])

    if not features:
        raise ValueError('No SIFT descriptors were extracted; cannot build a vocabulary.')

    features = np.concatenate(features, axis=0).astype('float64')

    return vlfeat.kmeans.kmeans(features, vocab_size)


def svm_classify(train_image_feats, train_labels, test_image_feats):
    """Fit a LinearSVC (C=2) on the training features and return its predictions for the test features."""
    classifier = LinearSVC(C=2).fit(train_image_feats, train_labels)
    return classifier.predict(test_image_feats)


def test_accuracy(test_labels, predicted_labels):
    num_correct = 0
    for i,label in enumerate(test_labels):
        if (predicted_labels[i] == label):
            num_correct += 1
    return num_correct/len(test_labels)

All function declarations are above.

In [13]:
def _read_image_set(list_file):
    """Return 'datasets/JPEGImages/<id>.jpg' for every non-blank image id in `list_file`."""
    with open(list_file) as id_file:
        return ['datasets/JPEGImages/{}.jpg'.format(line.rstrip())
                for line in id_file if line.strip()]


def _collect_image_paths(class_dir):
    """Return the paths of all files found one directory level below `class_dir`."""
    paths = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for entry in sorted(os.listdir(class_dir)):
        sub_dir = os.path.join(class_dir, entry)
        if not os.path.isdir(sub_dir):
            continue
        for file_name in sorted(os.listdir(sub_dir)):
            paths.append(os.path.join(sub_dir, file_name))
    return paths


# Full scene images listed in the validation / training image sets.
test_image_paths = _read_image_set('datasets/ImageSets/val.txt')
train_image_paths = _read_image_set('datasets/ImageSets/train.txt')

# Training templates, one directory tree per character.
template_dirs = ["templates/waldo", "templates/wenda", "templates/wizard"]
waldo_train_paths, wenda_train_paths, wizard_train_paths = [
    _collect_image_paths(class_dir) for class_dir in template_dirs
]

# Every training template, used to build the visual-word vocabulary.
all_paths = waldo_train_paths + wenda_train_paths + wizard_train_paths

# Held-out templates used to measure classification accuracy.
template_dirs_test = ["test/templates/waldo", "test/templates/wenda", "test/templates/wizard"]
waldo_test_paths, wenda_test_paths, wizard_test_paths = [
    _collect_image_paths(class_dir) for class_dir in template_dirs_test
]

print(len(all_paths))
print(len(waldo_train_paths))
print(len(wenda_train_paths))
print(len(wizard_train_paths))
print("---")
print(len(waldo_test_paths))
print(len(wenda_test_paths))
print(len(wizard_test_paths))

184
124
36
24
---
13
7
3


In [14]:
# Build the visual-word vocabulary from all training templates and store it on disk.
print('Using the BAG-OF-SIFT representation for images')

vocab_filename = 'vocab.pkl'

# The vocabulary is recomputed on every run of this cell.
vocab_size = 200  # Larger values will work better (to a point) but be slower to compute
vocab = build_vocabulary(all_paths, vocab_size)
# Prints True if any centroid contains NaN.
print(np.isnan(vocab).any())

with open(vocab_filename, 'wb') as f:
    pickle.dump(vocab, f)

# Report success only after the file has been written and closed.
print('{:s} saved'.format(vocab_filename))

Using the BAG-OF-SIFT representation for images
False
vocab.pkl saved


In [20]:
# Bag-of-SIFT histograms for the training templates, one array per character.
waldo_train_feats = bags_of_sifts(waldo_train_paths, vocab_filename)
wenda_train_feats = bags_of_sifts(wenda_train_paths, vocab_filename)
wizard_train_feats = bags_of_sifts(wizard_train_paths, vocab_filename)

# Flatten into one list of feature rows with a parallel list of class names.
training_feats = [*waldo_train_feats, *wenda_train_feats, *wizard_train_feats]
train_labels = (
    ["waldo"] * len(waldo_train_feats)
    + ["wenda"] * len(wenda_train_feats)
    + ["wizard"] * len(wizard_train_feats)
)

print("done train")


# Same encoding for the held-out test templates.
waldo_test_feats = bags_of_sifts(waldo_test_paths, vocab_filename)
wenda_test_feats = bags_of_sifts(wenda_test_paths, vocab_filename)
wizard_test_feats = bags_of_sifts(wizard_test_paths, vocab_filename)

test_feats = [*waldo_test_feats, *wenda_test_feats, *wizard_test_feats]
test_labels = (
    ["waldo"] * len(waldo_test_feats)
    + ["wenda"] * len(wenda_test_feats)
    + ["wizard"] * len(wizard_test_feats)
)

print("done test")

done train
done test


In [21]:
# Classify the held-out templates with the linear SVM, then report accuracy and raw predictions.
predicted_labels = svm_classify(
    train_image_feats=training_feats,
    train_labels=train_labels,
    test_image_feats=test_feats,
)

print(test_accuracy(test_labels=test_labels, predicted_labels=predicted_labels))

print(predicted_labels)

0.6956521739130435
['waldo' 'waldo' 'waldo' 'waldo' 'waldo' 'waldo' 'wizard' 'waldo' 'wenda'
 'waldo' 'waldo' 'waldo' 'waldo' 'wenda' 'wenda' 'waldo' 'waldo' 'wenda'
 'waldo' 'wenda' 'waldo' 'waldo' 'wizard']


In [114]:
def svm_probability(train_image_feats, train_labels, test_image_feats):
    """
    Fit an SVC (C=2, gamma='scale', probability=True) and score the test features.

    :return: the array produced by ``predict_proba`` for `test_image_feats`
          (one row per test sample, one column per class).
    """
    classifier = SVC(C=2, gamma='scale', probability=True)
    classifier.fit(train_image_feats, train_labels)
    return classifier.predict_proba(test_image_feats)

In [None]:
def find_characters():
    """
    Scan the validation images for Waldo, Wenda and the Wizard.

    For every image id in ``datasets/ImageSets/val.txt`` (reading stops at the
    first blank line) ORB keypoints are detected on the grayscale image. A patch
    of up to ``window`` x ``window`` pixels starting at each keypoint is encoded
    with ``bags_of_sifts_image`` and scored with ``svm_probability``. For each
    class, the ``num_of_max`` highest-scoring patches whose probability exceeds
    ``min_probability`` are appended to ``baseline_test/waldo.txt``,
    ``wenda.txt`` or ``wizard.txt`` as one line each:

        image_id probability row row_end col col_end

    Relies on the notebook globals ``vocab_filename``, ``training_feats`` and
    ``train_labels``. Images without keypoints are skipped.
    """
    window = 64            # side length of the patch cut out at each keypoint
    num_of_max = 5         # candidates kept per class and image
    min_probability = 0.5  # a candidate is written only above this probability

    # The detector settings do not depend on the image, so build it once.
    orb = cv2.ORB_create(nfeatures=100000, scoreType=cv2.ORB_FAST_SCORE)

    # Context managers make sure all result files are flushed and closed.
    with open('datasets/ImageSets/val.txt') as id_file, \
            open('baseline_test/waldo.txt', 'w+') as waldo_out, \
            open('baseline_test/wenda.txt', 'w+') as wenda_out, \
            open('baseline_test/wizard.txt', 'w+') as wizard_out:

        # Probability column k is written to outputs[k]. This assumes the
        # classifier orders its classes waldo, wenda, wizard (sorted label order).
        outputs = (waldo_out, wenda_out, wizard_out)

        for line in id_file:
            image_id = line.rstrip()
            if not image_id:
                break

            image = np.asarray(plt.imread('datasets/JPEGImages/' + image_id + '.jpg'))
            height, width = image.shape[:2]
            img_gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

            keypoints, _ = orb.detectAndCompute(img_gray, None)

            test_feats = []
            for keypoint in keypoints:
                col, row = keypoint.pt  # OpenCV keypoints are (x, y) = (column, row)
                row = int(np.round(row))
                col = int(np.round(col))
                row_end = min(row + window, height - 1)
                col_end = min(col + window, width - 1)

                # Describe the local patch, not the whole image; otherwise
                # every keypoint would get exactly the same feature vector.
                patch = img_gray[row:row_end, col:col_end]
                test_feats.extend(bags_of_sifts_image(patch, vocab_filename))

            if not test_feats:
                # No keypoints means nothing to classify for this image.
                continue

            probability = svm_probability(training_feats, train_labels, test_feats)

            n_candidates = len(test_feats)
            n_top = min(num_of_max, n_candidates)
            if n_top < n_candidates:
                # Per class (column), the indices of the n_top most probable patches.
                locations = np.argpartition(-probability, n_top, axis=0)[:n_top]
            else:
                # Too few patches to partition: every patch is a candidate.
                locations = np.tile(np.arange(n_top)[:, np.newaxis],
                                    (1, probability.shape[1]))

            for k, out_file in zip(range(probability.shape[1]), outputs):
                for rank in range(n_top):
                    kp_index = locations[rank][k]
                    score = probability[kp_index][k]
                    if not score > min_probability:
                        continue

                    col, row = keypoints[kp_index].pt
                    row = int(np.round(row))
                    col = int(np.round(col))
                    row_end = min(row + window, height - 1)
                    col_end = min(col + window, width - 1)

                    out_file.write('{} {} {} {} {} {}\n'.format(
                        image_id, score, row, row_end, col, col_end))


find_characters()

print("done")