In [1]:
from glob import glob
import random
import math
from tqdm import tqdm

import numpy as np
import ipyplot
from PIL import Image
from sklearn.cluster import KMeans
from itertools import compress
import pickle
import pandas as pd
import cv2
import ipyplot
import gensim
from ast import literal_eval
import pathlib

from MoCoFeatureExtractor import MoCoFeatureExtractor

cpu


### Parameters and Utility methods for extracting patches

In [2]:
# Tag that is part of every artifact file name below, together with the window sizes and stride.
version = 'c'

# Side lengths (in pixels) of the two square windows cut around each grid point.
window_size_outer = 288
window_size_inner = 144

# Pixel distance between neighbouring grid points.
stride = 72

# K-means sizes: number of outer clusters, and number of inner clusters fitted per outer cluster.
outer_clusters = 100
inner_clusters = 30

# Combined cluster ids are rendered as zero-padded four-digit "words".
word_format_string = '{:04d}'

# Random walks: steps per walk, and walks generated per image.
walk_length = 10
walks_per_image = 100

# Upper bound on the grid cells sampled from one image when fitting the outer clusters.
cluster_patches_per_image = 24

# Factor applied to both image dimensions before any patch is extracted.
image_scale = 1

# Length of one feature vector, used when reshaping the extractor output into a grid.
feature_dim = 2048

# Artifact files produced and consumed by the later cells.
cluster_file = f'clusters_{window_size_outer}_{window_size_inner}_{stride}_{version}.pkl'
image_cluster_grid_file = f'image_cluster_grids_{window_size_outer}_{window_size_inner}_{stride}_{version}.npy'
sequences_file = f'sequences_{window_size_outer}_{window_size_inner}_{stride}_{version}.csv'
doc2vec_file = f'doc2vec_{window_size_outer}_{window_size_inner}_{stride}_{version}.model'

# Feature extractor shared by every cell that turns image patches into vectors.
cnn = MoCoFeatureExtractor(params_path='/data/moco_v2_800ep_pretrain.pth.tar')

# Training images; the parent directory name is used as the class label.
image_files = glob("/data/dataset_1000/train/*/*.jpg")

# Maps the file name up to its first '.' to the name of the directory containing the file.
image_id_to_class = {
    f.split('/')[-1].split('.')[0]: f.split('/')[-2]
    for f in image_files
}

def extract_windows(frame, pos, window_size):
    """Cut one square window out of ``frame`` for every centre listed in ``pos``.

    Args:
        frame: image array that ``extract_window`` slices along its first two axes.
        pos: sequence of window centres, one ``(first-axis, second-axis)`` pair each.
        window_size: side length of every window.

    Returns:
        uint8 array of shape ``(len(pos), window_size, window_size, 3)``.
    """
    count = len(pos)
    windows = np.empty((count, window_size, window_size, 3), dtype=np.uint8)

    for slot, centre in enumerate(pos):
        windows[slot] = extract_window(frame, centre, window_size)

    return windows


def extract_window(frame, pos, window_size):
    """Slice a ``window_size`` x ``window_size`` window out of ``frame`` centred on ``pos``.

    The start index on each axis is ``pos`` minus half the window size, rounded
    to the nearest integer; the stop index is start + ``window_size``. No bounds
    check is made, so the usual slicing rules apply to out-of-range indices.
    """
    offset = window_size / 2.0

    start_0 = int(round(pos[0] - offset))
    start_1 = int(round(pos[1] - offset))

    stop_0 = start_0 + window_size
    stop_1 = start_1 + window_size

    return frame[start_0:stop_0, start_1:stop_1]



def get_rad_grid(grid_pos, rad, grid_shape):
    """List the grid cells on the square ring at distance ``rad`` around ``grid_pos``.

    The ring is the border of the ``(2*rad+1)`` x ``(2*rad+1)`` square centred on
    ``grid_pos``. Cells outside ``grid_shape`` are dropped.

    Returns:
        List of ``(i, j)`` tuples, ordered as: the border line with the smallest
        second coordinate, the one with the largest second coordinate, then the
        remaining cells of the border lines with the smallest and largest first
        coordinate.
    """
    start_0 = grid_pos[0] - rad
    start_1 = grid_pos[1] - rad
    span = 2 * rad

    # Two full border lines (corners included) ...
    candidates = [(start_0 + k, start_1) for k in range(span + 1)]
    candidates += [(start_0 + k, start_1 + span) for k in range(span + 1)]
    # ... then the two remaining lines without their corners.
    candidates += [(start_0, start_1 + k) for k in range(1, span)]
    candidates += [(start_0 + span, start_1 + k) for k in range(1, span)]

    limit_0, limit_1 = grid_shape[0], grid_shape[1]
    return [
        (a, b)
        for (a, b) in candidates
        if 0 <= a < limit_0 and 0 <= b < limit_1
    ]



def next_pos(salient_grid_positions, grid_shape, current_position):
    """Choose the next grid cell of a random walk.

    When ``current_position`` is given, its in-bounds ring neighbours (radius 1)
    are tried in random order and the first one contained in
    ``salient_grid_positions`` is returned. When there is no current position,
    or no neighbour qualifies, a cell is drawn uniformly from
    ``salient_grid_positions``.

    Args:
        salient_grid_positions: collection (typically a set) of allowed ``(y, x)`` cells.
        grid_shape: grid extent, used to discard out-of-bounds neighbours.
        current_position: current ``(y, x)`` cell, or ``None`` to start a walk.

    Returns:
        The chosen ``(y, x)`` cell.

    Raises:
        ValueError: if a random draw is needed and ``salient_grid_positions`` is empty.
    """
    if current_position is not None:

        rad_grid = get_rad_grid(current_position, 1, grid_shape)

        if len(rad_grid) == 0:
            print("frame empty?")

        else:
            random.shuffle(rad_grid)
            for loc in rad_grid:
                if loc in salient_grid_positions:
                    return loc

    # random.sample() stopped accepting sets (deprecated in 3.9, TypeError from
    # 3.11), so materialise the population as a sequence first.
    population = tuple(salient_grid_positions)
    if not population:
        raise ValueError("next_pos: no grid positions to sample from")

    return random.sample(population, 1)[0]

### Compute features for all images

In [None]:
# All feature grids of this run go below this directory, one sub-folder per class.
# NOTE(review): the clustering cells glob '/data/feat_grids_...'; this relative
# path only matches when the notebook's working directory is /data — confirm.
feat_grid_root = pathlib.Path(f'feat_grids_{window_size_outer}_{window_size_inner}_{stride}_{version}')

for image_file in tqdm(image_files, total=len(image_files)):

    with Image.open(image_file) as opened_image:
        pil_image = opened_image.convert('RGB')
    pil_image = pil_image.resize((int(round(pil_image.size[0] * image_scale)), int(round(pil_image.size[1] * image_scale))))
    image = np.array(pil_image)

    # Skip images smaller than two outer windows or larger than 1024 px on either axis.
    if image.shape[0] < window_size_outer * 2 or image.shape[1] < window_size_outer * 2 or image.shape[0] > 1024 or image.shape[1] > 1024:
        continue

    # Lay a regular grid of window centres over the image: `margin` reserves room
    # so that an outer window around a border grid point stays inside the image,
    # and `offsets` centres the grid within the image.
    margin = window_size_outer-stride
    grid_shape = (math.floor((image.shape[0] - margin) / stride), math.floor((image.shape[1] - margin) / stride))
    offsets = (round((image.shape[0] - grid_shape[0] * stride)/2), round((image.shape[1] - grid_shape[1] * stride)/2))

    points = [(offsets[0]+y*stride+stride/2,offsets[1]+x*stride+stride/2) for y in range(grid_shape[0]) for x in range(grid_shape[1])]

    patches_outer = extract_windows(image, points, window_size_outer)
    windows_outer = patches_outer.astype(np.float64)

    patches_inner = extract_windows(image, points, window_size_inner)
    windows_inner = patches_inner.astype(np.float64)

    try:
        feats_outer = cnn.evalRGB(windows_outer)
        feats_inner = cnn.evalRGB(windows_inner)
    except Exception:
        # Name the failing file (not the raw pixel array) before propagating.
        print("ERROR cnn.evalRGB", image_file, image.shape, windows_outer.shape, windows_inner.shape)
        raise

    # One feature vector per grid cell, in the same row-major order as `points`.
    feat_grid_outer = feats_outer.reshape((grid_shape[0], grid_shape[1], feature_dim))
    feat_grid_inner = feats_inner.reshape((grid_shape[0], grid_shape[1], feature_dim))

    path_parts = image_file.split('/')
    image_id = path_parts[-1].split('.')[0]
    image_class = path_parts[-2]

    class_dir = feat_grid_root / image_class
    class_dir.mkdir(parents=True, exist_ok=True)
    np.savez_compressed(str(class_dir / f'{image_id}.npz'), outer=feat_grid_outer, inner=feat_grid_inner)

### Cluster Outer Clusters

In [None]:
npz_files = glob(f'/data/feat_grids_{window_size_outer}_{window_size_inner}_{stride}_{version}/*/*.npz')

# Training set for the outer K-means: a bounded random sample of outer-window
# feature vectors from every stored feature grid.
X = []

for npz_file in tqdm(npz_files, total=len(npz_files)):

    # Close the archive as soon as the array has been read.
    with np.load(npz_file) as loaded:
        feat_grid_outer = loaded['outer']

    grid_locs = [(y, x) for y in range(feat_grid_outer.shape[0]) for x in range(feat_grid_outer.shape[1])]

    # Cap the contribution of a single image at cluster_patches_per_image cells.
    if len(grid_locs) > cluster_patches_per_image:
        grid_locs = random.sample(grid_locs, cluster_patches_per_image)

    X.extend(feat_grid_outer[loc] for loc in grid_locs)

print("Clustering with KMeans: len(X)", len(X))

# Fail with a clear message instead of an opaque error from inside KMeans.fit.
if len(X) < outer_clusters:
    raise ValueError(
        f"only {len(X)} feature vectors collected from {len(npz_files)} files; "
        f"need at least {outer_clusters} to fit the outer clusters"
    )

clusters = KMeans(n_clusters=outer_clusters, verbose=False)
clusters.fit(np.array(X, dtype=np.float32))

with open(cluster_file, "wb") as cluster_fh:
    pickle.dump(clusters, cluster_fh)

print("done")

### Cluster Inner Clusters

In [None]:
with open(cluster_file, "rb") as cluster_fh:
    clusters = pickle.load(cluster_fh)

npz_files = glob(f'/data/feat_grids_{window_size_outer}_{window_size_inner}_{stride}_{version}/*/*.npz')

# One inner K-means per outer cluster; entry i is fitted on the inner-window
# features of all grid cells assigned to outer cluster i.
sub_clusters = [KMeans(n_clusters=inner_clusters) for _ in range(outer_clusters)]

# Outer clusters are handled in chunks of this size; every chunk re-reads all
# feature grids and keeps only the cells that belong to the chunk.
chunk_size = 20

assert(outer_clusters % chunk_size == 0)

# The per-cluster checkpoints below are written into this directory; create it
# up front so the first dump does not fail on a fresh run.
pathlib.Path('sub_clusters').mkdir(parents=True, exist_ok=True)

for c in range(outer_clusters // chunk_size):
    first_id = chunk_size * c
    Y = [[] for _ in range(chunk_size)]

    for npz_file in tqdm(npz_files, total=len(npz_files)):

        with np.load(npz_file) as loaded:
            feat_grid_outer = loaded['outer']
            feat_grid_inner = loaded['inner']

        rows, cols, dim = feat_grid_outer.shape

        grid_cluster_ids = clusters.predict(feat_grid_outer.reshape(rows * cols, dim))
        grid_cluster_ids = grid_cluster_ids.reshape((rows, cols))

        for y in range(rows):
            for x in range(cols):
                gcid = grid_cluster_ids[y, x]
                if first_id <= gcid < first_id + chunk_size:
                    Y[gcid % chunk_size].append(feat_grid_inner[y, x])

    for d in range(chunk_size):
        print(len(Y[d]))
        sub_clusters[first_id + d].fit(np.array(Y[d], dtype=np.float32))
        # NOTE(review): this checkpoints the whole list (including models not yet
        # fitted) under a per-cluster file name; kept as-is — confirm whether only
        # sub_clusters[first_id + d] was meant.
        with open(f'sub_clusters/sub_cluster_{first_id + d}.pkl', "wb") as checkpoint_fh:
            pickle.dump(sub_clusters, checkpoint_fh)


with open("sub_clusers.pkl", "wb") as sub_clusters_fh:
    pickle.dump(sub_clusters, sub_clusters_fh)

100%|██████████| 15112/15112 [03:09<00:00, 79.65it/s]


18218
9904
22589


### Generate Sequences

In [None]:
# Load the fitted outer K-means and the list of per-outer-cluster inner K-means
# models; the files are closed as soon as they have been read.
with open(cluster_file, "rb") as cluster_fh:
    clusters_outer = pickle.load(cluster_fh)
with open("sub_clusers.pkl", "rb") as sub_clusters_fh:
    clusters_inner = pickle.load(sub_clusters_fh)

# Every stored feature grid of this run.
npz_files = glob(f'/data/feat_grids_{window_size_outer}_{window_size_inner}_{stride}_{version}/*/*.npz')

def generate_image_sequences(npz_file):
    """Turn one stored feature grid into ``walks_per_image`` random-walk word sequences.

    Each grid cell gets a combined id ``outer_id * inner_clusters + inner_id``:
    the outer id comes from ``clusters_outer`` on the cell's outer feature, the
    inner id from the inner model of that outer cluster on the cell's inner
    feature. Each walk visits ``walk_length`` cells chosen by ``next_pos`` over
    the full grid; the visited ids are formatted with ``word_format_string``.

    Returns:
        List of ``walks_per_image`` lists, each holding ``walk_length`` strings.
    """
    loaded = np.load(npz_file)
    outer_feats = loaded['outer']
    inner_feats = loaded['inner']

    rows, cols = outer_feats.shape[0], outer_feats.shape[1]

    # Outer ids for all cells in a single predict call, folded back into grid shape.
    flat_outer = outer_feats.reshape(rows * cols, outer_feats.shape[2])
    outer_ids = clusters_outer.predict(flat_outer).reshape((rows, cols))

    # Inner ids are predicted cell by cell with the model of the cell's outer cluster.
    word_ids = np.empty(outer_ids.shape, dtype=outer_ids.dtype)
    for cell in np.ndindex(rows, cols):
        outer_id = outer_ids[cell]
        inner_model = clusters_inner[int(outer_id)]
        inner_id = inner_model.predict(inner_feats[cell].reshape(1, -1))[0]
        word_ids[cell] = outer_id * inner_clusters + inner_id

    # Every grid cell is an allowed walk position.
    all_cells = {(y, x) for y in range(rows) for x in range(cols)}

    seqs = []
    for _ in range(walks_per_image):
        pos = None
        walk = []
        for _ in range(walk_length):
            pos = next_pos(all_cells, word_ids.shape, pos)
            walk.append(word_format_string.format(word_ids[pos]))
        seqs.append(walk)

    return seqs


# Two parallel columns: one word sequence per row, and the feature-grid file it came from.
cluster_seqs = []
image_file_colummn = []

for npz_file in tqdm(npz_files, total=len(npz_files)):
    c_seqs = generate_image_sequences(npz_file)
    if c_seqs is not None:
        cluster_seqs += c_seqs
        # One file entry per generated walk keeps both columns the same length.
        image_file_colummn += [npz_file] * walks_per_image

data_frame = pd.DataFrame({'words': cluster_seqs, 'file': image_file_colummn})
data_frame.to_csv(sequences_file)

print("done", len(cluster_seqs))

In [None]:
class callback(gensim.models.callbacks.CallbackAny2Vec):
    '''Training callback that prints a running epoch counter at the end of each epoch.'''

    def __init__(self):
        # Number of epochs finished so far.
        self.epoch = 0

    def on_epoch_end(self, model):
        finished = self.epoch
        self.epoch = finished + 1
        print('epoch {}'.format(finished))
              
def read_corpus(fname, tokens_only=False):
    """Yield the word sequences stored in the CSV file ``fname``.

    The file must have a ``words`` column holding the string form of a Python
    list (as written by ``DataFrame.to_csv``); it is parsed with ``literal_eval``.

    Args:
        fname: path of the sequences CSV to read.
        tokens_only: when true, yield the bare token lists; otherwise yield
            ``TaggedDocument`` objects tagged with the row's index label in the
            loaded frame.

    Yields:
        ``list[str]`` or ``gensim.models.doc2vec.TaggedDocument`` per row.
    """
    # Read the file that was asked for; the argument used to be ignored in
    # favour of the global `sequences_file`.
    data_frame = pd.read_csv(fname, converters={"words": literal_eval})

    # Iterating the single column avoids building a Series per row (iterrows).
    for index, words in data_frame['words'].items():
        if tokens_only:
            yield words
        else:
            yield gensim.models.doc2vec.TaggedDocument(words, [index])

# Materialise the tagged corpus once; it is consumed twice below (vocabulary, training).
train_corpus = list(read_corpus(sequences_file))
print(train_corpus[:2])

# 256-dimensional document vectors, trained for 30 epochs.
model = gensim.models.doc2vec.Doc2Vec(vector_size=256, epochs=30)
model.build_vocab(train_corpus)

epoch_logger = callback()
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
    callbacks=[epoch_logger],
)

model.save(doc2vec_file)

print("done")

In [None]:
def mask_locations(mask, stride, grid_shape, offsets):
    """Return the grid cells whose ``stride`` x ``stride`` mask window is at least 30% set.

    For every cell ``(y, x)`` of ``grid_shape`` a window of side ``stride`` is cut
    from ``mask`` around the cell centre (``offsets`` plus ``stride`` steps plus
    half a stride). The cell is kept when the sum over that window reaches
    ``0.3 * stride * stride``.

    Returns:
        Set of ``(y, x)`` tuples.
    """
    half_stride = 0.5 * stride
    min_sum = stride * stride * 0.3

    def covered(y, x):
        centre = (offsets[0] + y * stride + half_stride, offsets[1] + x * stride + half_stride)
        return np.sum(extract_window(mask, centre, stride)) >= min_sum

    return {
        (y, x)
        for y in range(grid_shape[0])
        for x in range(grid_shape[1])
        if covered(y, x)
    }

     
def generate_masked_image_sequences(image_file, mask_file, clusters_outer, clusters_inner, feature_extractor, seq_count=walks_per_image):
    """Generate random-walk word sequences restricted to the masked part of an image.

    The image is laid out on the same stride grid as in the feature-extraction
    cell. Only grid cells selected by ``mask_locations`` are featurised,
    assigned a combined cluster id (``outer_id * inner_clusters + inner_id``)
    and used as walk positions.

    Args:
        image_file: path of the image to process.
        mask_file: path of the mask image; it is converted to mode '1'.
        clusters_outer: fitted model whose ``predict`` gives the outer cluster ids.
        clusters_inner: sequence of fitted models, indexed by outer cluster id.
        feature_extractor: object whose ``evalRGB`` maps a batch of windows to features.
        seq_count: number of walks to generate.

    Returns:
        List of ``seq_count`` lists of ``walk_length`` formatted words, or
        ``None`` when the image is too small or the mask selects no grid cell.
    """
    with Image.open(image_file) as opened_image:
        pil_image = opened_image.convert('RGB')
    pil_image = pil_image.resize((int(round(pil_image.size[0] * image_scale)), int(round(pil_image.size[1] * image_scale))))
    image = np.array(pil_image)

    if image.shape[0] < window_size_outer * 2 or image.shape[1] < window_size_outer * 2:
        # The file name used to sit inside the string literal and was never printed.
        print("image too small", image_file)
        return None

    with Image.open(mask_file) as opened_mask:
        pil_mask = opened_mask.convert('1')
    pil_mask = pil_mask.resize((int(round(pil_mask.size[0] * image_scale)), int(round(pil_mask.size[1] * image_scale))))
    mask = np.array(pil_mask)

    margin = window_size_outer-stride
    grid_shape = (math.floor((image.shape[0] - margin) / stride), math.floor((image.shape[1] - margin) / stride))
    offsets = (round((image.shape[0] - grid_shape[0] * stride)/2), round((image.shape[1] - grid_shape[1] * stride)/2))

    grid_locations_set = mask_locations(mask, stride, grid_shape, offsets)

    if not grid_locations_set:
        # Without any allowed cell there is nothing to featurise or to walk on.
        print("mask selects no grid cell", mask_file)
        return None

    grid_locations_list = list(grid_locations_set)

    points = [(y*stride + stride/2 + offsets[0], x*stride + stride/2 + offsets[1]) for (y,x) in grid_locations_list]

    patches_outer = extract_windows(image, points, window_size_outer)
    windows_outer = patches_outer.astype(np.float64)

    patches_inner = extract_windows(image, points, window_size_inner)
    windows_inner = patches_inner.astype(np.float64)

    try:
        # Use the extractor that was passed in rather than the global `cnn`.
        feats_outer = feature_extractor.evalRGB(windows_outer)
        feats_inner = feature_extractor.evalRGB(windows_inner)
    except Exception:
        # Name the failing file (not the raw pixel array) before propagating.
        print("ERROR cnn.evalRGB", image_file, image.shape, windows_outer.shape, windows_inner.shape)
        raise

    feats_outer = feats_outer.reshape((len(points), feature_dim))
    feats_inner = feats_inner.reshape((len(points), feature_dim))

    grid_cluster_ids_outer = clusters_outer.predict(feats_outer)

    # Combined cluster id per selected grid cell, keyed by the cell's (y, x).
    grid_cluster_ids = {}

    for idx, loc in enumerate(grid_locations_list):
        outer_cluster = grid_cluster_ids_outer[idx]
        inner_cluster = clusters_inner[int(outer_cluster)].predict(feats_inner[idx].reshape(1, -1))[0]
        grid_cluster_ids[loc] = outer_cluster*inner_clusters + inner_cluster

    cluster_seqs = []
    for _ in range(seq_count):
        cluster_seq = []

        pos = None

        for _ in range(walk_length):
            pos = next_pos(grid_locations_set, grid_shape, pos)
            cluster_seq.append(grid_cluster_ids[pos])

        cluster_seqs.append([word_format_string.format(w) for w in cluster_seq])

    return cluster_seqs

### Evaluate retrieval accuracy on masked test images

In [None]:
with open(cluster_file, "rb") as cluster_fh:
    clusters_outer = pickle.load(cluster_fh)
with open("sub_clusers.pkl", "rb") as sub_clusters_fh:
    clusters_inner = pickle.load(sub_clusters_fh)

model = gensim.models.doc2vec.Doc2Vec.load(doc2vec_file)

# Training sequences; the 'file' column gives the source of each document tag.
data_frame = pd.read_csv(sequences_file, converters={"words": literal_eval})

test_image_files = sorted(glob("/data/dataset_100/test/*/*.jpg"))
test_mask_files = sorted(glob("/data/dataset_100/test/*/*.mask.png"))

# Images and masks are paired purely by their position in the sorted lists, so
# a missing file on either side would silently shift every later pair.
assert len(test_image_files) == len(test_mask_files), (
    f"{len(test_image_files)} test images but {len(test_mask_files)} masks"
)

correct = 0
total = 0

for image_file, mask_file in zip(test_image_files, test_mask_files):

    image_correct = 0
    image_total = 0

    print("test", image_file)

    c_seqs = generate_masked_image_sequences(image_file, mask_file, clusters_outer, clusters_inner, cnn, seq_count=100)

    if c_seqs is None:
        continue

    # The class label is the name of the directory containing the file.
    true_class = image_file.split('/')[-2]

    vectors = [[model.infer_vector(s)] for s in c_seqs]

    for v in tqdm(vectors, total=len(vectors)):
        # NOTE(review): `docvecs` is the gensim 3.x attribute name — confirm the
        # installed gensim version still provides it.
        similar = model.docvecs.most_similar(v, topn=10)

        for s in similar:
            # s[0] is the document tag, i.e. the row index in the sequences CSV.
            f = data_frame.loc[s[0], 'file']

            if f.split('/')[-2] == true_class:
                image_correct += 1
                correct += 1

            image_total += 1
            total += 1

    # A score is the fraction of retrieved neighbours that share the query's class.
    if image_total:
        print("score", image_correct/image_total)
    else:
        print("score", "n/a (no neighbours retrieved)")

if total:
    print("final score", correct/total)
else:
    print("final score", "n/a (no test image was scored)")