In [None]:
%matplotlib inline

In [None]:
from fastai.vision import *
import pandas as pd
import numpy as np
from pathlib import Path
import omicronscala
import spym
import xarray
import os
import torch
import numpy as np
from kmeans_pytorch import kmeans

In [None]:
def savePickle(obj, filename):
    """Serialize `obj` with pickle into the file `<filename>.pkl`.

    The '.pkl' suffix is appended to `filename`; an existing file at that
    location is overwritten.
    """
    target = '{}.pkl'.format(filename)
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle)
        
def loadPickle(filename):
    """Return the object unpickled from the file `<filename>.pkl`.

    The '.pkl' suffix is appended to `filename`. Unpickling can execute
    arbitrary code, so only load files from a trusted source.
    """
    source = '{}.pkl'.format(filename)
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# credits https://github.com/aayushmnit/Deep_learning_explorations
class Hook():
    "Create a hook on `m` with `hook_func`."
    def __init__(self, m:nn.Module, hook_func:HookFunc, is_forward:bool=True, detach:bool=True):
        # `stored` holds whatever `hook_func` returned on the most recent call.
        self.hook_func = hook_func
        self.detach = detach
        self.stored = None
        # Pick the registration method matching the requested direction.
        if is_forward:
            register = m.register_forward_hook
        else:
            register = m.register_backward_hook
        self.hook = register(self.hook_fn)
        self.removed = False

    def hook_fn(self, module:nn.Module, input:Tensors, output:Tensors):
        "Applies `hook_func` to `module`, `input`, `output`."
        if self.detach:
            # List-like values become a lazy generator of detached items
            # (single pass only); anything else is detached directly.
            if is_listy(input):
                input = (o.detach() for o in input)
            else:
                input = input.detach()
            if is_listy(output):
                output = (o.detach() for o in output)
            else:
                output = output.detach()
        self.stored = self.hook_func(module, input, output)

    def remove(self):
        "Remove the hook from the model."
        if self.removed:
            return
        self.hook.remove()
        self.removed = True

    # Context-manager support: the hook is removed when the `with` block exits.
    def __enter__(self, *args):
        return self

    def __exit__(self, *args):
        self.remove()
        
def get_output(module, input_value, output):
    """Hook function: return `output` flattened from dimension 1 onwards.

    `module` and `input_value` are accepted for hook-signature compatibility
    and are not used.
    """
    flattened = output.flatten(start_dim=1)
    return flattened

def get_input(module, input_value, output):
    """Hook function: return the first element of the module's inputs.

    `input_value` is materialised into a list first, so it may be any
    iterable (including a generator). `module` and `output` are not used.
    """
    inputs = list(input_value)
    return inputs[0]

def get_named_module_from_model(model, name):
    """Return the first sub-module of `model` whose qualified name equals `name`.

    Names are those yielded by `model.named_modules()`. Returns None when no
    module matches.
    """
    matches = (module for mod_name, module in model.named_modules() if mod_name == name)
    return next(matches, None)

In [None]:
def get_data_df():
    """Create the DataFrame consumed by the fastai `ImageList` loader.

    Loads the pickled frame 'clean_stm.pkl' and derives, for every row:

    - path:           'data/train/<index>.png'
    - category_label: integer code of the `Categories` value
    - dataset:        the constant 'train'
    - category:       the original `Categories` value
    - is_valid:       True where `Categories` equals 'mixed'

    Returns:
        A new DataFrame restricted to those five columns.
    """
    data_df = loadPickle('clean_stm')
    categories = data_df['Categories']

    data_df['path'] = ['data/train/{}.png'.format(x) for x in data_df.index.values]
    data_df['category_label'] = categories.astype("category").cat.codes
    data_df['dataset'] = 'train'
    data_df['category'] = categories
    # Vectorised comparison instead of a row-by-row iterrows() loop.
    data_df['is_valid'] = categories == 'mixed'

    # .copy() so callers get an independent frame rather than a view.
    return data_df[['path', 'category_label', 'dataset', 'category', 'is_valid']].copy()

def get_dict_category_labels(data_df):
    """Build a dictionary mapping each category label to its category.

    Args:
        data_df: DataFrame with 'category' and 'category_label' columns.

    Returns:
        dict keyed by `category_label` with the matching `category` as value,
        one entry per distinct (category, category_label) pair.
    """
    pair_counts = data_df.groupby(['category', 'category_label']).size()
    pairs = pair_counts.reset_index().rename(columns={0: 'count'})
    labels = pairs['category_label'].to_list()
    categories = pairs['category'].to_list()
    return {label: category for label, category in zip(labels, categories)}

def get_imgs_data(data_df, images_path):
    """Build the labelled image source and its DataBunch.

    Images are read from the 'path' column of `data_df` relative to
    `images_path`, split on the 'is_valid' column and labelled with
    'category_label'. The DataBunch uses the default `get_transforms()`
    augmentations, size 224, batch size 32 and ImageNet normalisation.

    Returns:
        (data_source, data): the labelled lists and the DataBunch.
    """
    image_list = ImageList.from_df(df=data_df, path=images_path, cols=['path'])
    split_lists = image_list.split_from_df(col='is_valid')
    data_source = split_lists.label_from_df(cols='category_label')

    tmfs = get_transforms()
    transformed = data_source.transform(tmfs, size=224)
    data = transformed.databunch(bs=32).normalize(imagenet_stats)
    return data_source, data

In [None]:
def get_img_features_df(linear_output_layer, inference_dataloader):
    """Extract one feature vector per image from the input of a model layer.

    A forward hook on `linear_output_layer` captures the layer's input while
    every batch of `inference_dataloader` is pushed through the module-level
    `model`. Each captured batch is reshaped to (batch_size, -1) and paired
    with the corresponding entries of `inference_dataloader.items`.

    Relies on the module-level names `model` and `inference_data`. The
    dataloader must iterate in item order (no shuffling) for the item/feature
    and item/label pairing to be correct.

    Args:
        linear_output_layer: module whose input is used as the representation.
        inference_dataloader: dataloader exposing `.items` and yielding (xb, yb).

    Returns:
        DataFrame indexed by 'ID' (file stem of the image path) with columns
        'img_path', 'img_repr', 'label' and 'label_id'.
    """
    img_repr_map = {}
    items = inference_dataloader.items
    offset = 0  # position in `items` of the first image of the current batch

    model.eval()
    # no_grad: only activations are needed, so skip building the autograd graph.
    with Hook(linear_output_layer, get_input, True, True) as hook, torch.no_grad():
        for xb, _ in inference_dataloader:
            bs = xb.shape[0]
            # Running offset works for any batch size and for a short last batch
            # (the previous code assumed a batch size of exactly 32).
            img_ids = items[offset:offset + bs]
            offset += bs
            model(xb)  # output unused; the hook stores the layer input
            img_reprs = hook.stored.cpu().numpy().reshape(bs, -1)
            for img_id, img_repr in zip(img_ids, img_reprs):
                img_repr_map[img_id] = img_repr

    img_repr_df = pd.DataFrame(list(img_repr_map.items()), columns=['img_path', 'img_repr'])
    img_repr_df['ID'] = [x.split('/')[-1].split('.')[0] for x in img_repr_df['img_path']]
    img_repr_df.set_index('ID', inplace=True)

    # Labels are taken positionally from the training set of `inference_data`.
    label_ids = inference_data.train_ds.y.items[0:img_repr_df.shape[0]]
    img_repr_df['label'] = [inference_data.classes[x] for x in label_ids]
    img_repr_df['label_id'] = label_ids
    return img_repr_df

def torch_save(img_repr_df,fname):
    """Stack the 'img_repr' vectors into one tensor and save it with torch.

    Works on a copy of `img_repr_df`, re-indexed by the file stem of
    'img_path' and with the 'label' and 'label_id' columns dropped; the
    caller's frame is not modified. The resulting tensor has one row per
    DataFrame row and is written to the path `fname`.
    """
    features = img_repr_df.copy()
    stems = [p.split('/')[-1].split('.')[0] for p in img_repr_df['img_path']]
    features['ID'] = stems
    features.set_index('ID', inplace=True)
    features.drop(columns=['label', 'label_id'], inplace=True)

    stacked = np.stack(features['img_repr'].to_numpy())
    tensor = torch.from_numpy(stacked)
    torch.save(tensor, '{}'.format(fname))

In [None]:
# prepare dataframe
data_df = get_data_df()

# mapping categories -> labels
# (the frame must be passed explicitly; the function has no default argument)
dict_category_labels = get_dict_category_labels(data_df)
savePickle(dict_category_labels,'labels_dict')

#data folder path
images_path = Path('path_to_data_folder')

# ImageLoader, Images
data_source, data = get_imgs_data(data_df, images_path)

#get resnet pretrained on imagenet
learner = cnn_learner(data, models.resnet50, pretrained=True)
model = learner.model

#select layer for feature extraction
linear_output_layer = get_named_module_from_model(model, '1.4')

#prepare images and dataloader
# `tmfs` is local to get_imgs_data, so it has to be created here as well.
tmfs = get_transforms()
inference_data = data_source.transform(tmfs, size=224).databunch(bs=32).normalize(imagenet_stats)
# shuffle=False keeps batches aligned with the dataloader's item order
inference_dataloader = inference_data.train_dl.new(shuffle=False,drop_last=False)

# get features df
img_repr_df = get_img_features_df(linear_output_layer, inference_dataloader)
savePickle(img_repr_df, "stm_features_df")

# simpler df
df_features = img_repr_df[['img_repr', 'label']]
savePickle(df_features, "stm_df_features")
torch_save(img_repr_df, 'S_features_resnet50_4096')

# categories distribution: fraction of all images carrying each label,
# keyed by category name
len_dict = {}
for k,v in dict_category_labels.items():
    ldf = len(img_repr_df[img_repr_df['label']== k])/len(img_repr_df)
    len_dict[v] = ldf

In [None]:
import time
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

def get_similar_images(img_index, n=10):
    """Return the `n` rows of `img_repr_df` ranked most similar by cosine similarity.

    The query row is looked up in the module-level `img_repr_df` by
    `str(img_index)`. Similarity is 1 - cosine distance between 'img_repr'
    vectors. The single highest-ranked entry is skipped (presumably the query
    image itself - confirm) and the next `n` are returned, most similar first.

    Returns:
        (base_img_id, base_label, similar_rows) where the first two are the
        query row's first and third columns.
    """
    query_row = img_repr_df.loc[str(img_index)]
    base_img_id, base_vector, base_label, _ = query_row

    distances = img_repr_df['img_repr'].apply(lambda vec: cosine(vec, base_vector))
    cosine_similarity = 1 - distances

    ascending = np.argsort(cosine_similarity)
    runners_up = ascending[-n-1:-1]      # drop the top-ranked entry
    similar_img_ids = runners_up[::-1]   # most similar first
    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids]

def get_similar_images_euclidean(img_index, n=10):
    """Return the `n` rows of `img_repr_df` closest in Euclidean distance.

    The query row is looked up in the module-level `img_repr_df` by
    `str(img_index)`. Rows are ranked by ascending distance between 'img_repr'
    vectors and the first `n` are returned; unlike `get_similar_images`, the
    closest entry is not skipped.

    Returns:
        (base_img_id, base_label, closest_rows) where the first two are the
        query row's first and third columns.
    """
    query_row = img_repr_df.loc[str(img_index)]
    base_img_id, base_vector, base_label, _ = query_row

    similarity = img_repr_df['img_repr'].apply(lambda vec: euclidean(vec, base_vector))
    ascending = np.argsort(similarity)
    similar_img_ids = ascending[:n]
    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids]

def show_similar_images(similar_images_df):
    """Display the images listed in `similar_images_df` with their categories.

    Opens every path in the 'img_path' column and reconstructs a category
    from every 'label_id' via the module-level `learner`, then hands both
    lists to `learner.data.show_xys`.
    """
    label_list = learner.data.train_ds.y
    images = []
    for img_path in similar_images_df['img_path']:
        images.append(open_image(img_path))
    categories = []
    for label_id in similar_images_df['label_id']:
        categories.append(label_list.reconstruct(label_id))
    return learner.data.show_xys(images, categories)

def _similarity_stats(df, trials, n_imgs, search_fn, random_state=None):
    "Shared implementation of `cosine_stats` / `euclidean_stats`; `search_fn` performs the similarity search."
    cstats = {}
    for c in df['label'].unique():
        start = time.time()

        class_df = df[df['label'] == c]
        ldf = len(class_df)
        if ldf < trials:
            print('skipping {}: requested {} samples out of {} images'.format(dict_category_labels[c], trials, ldf))
            continue

        c_imgs = []   # index values of the sampled query images
        c_dfs = []    # one result frame per query image
        rdict = {}    # category name -> total number of retrieved images
        for i in class_df.sample(trials, random_state=random_state).index:
            _, _, similar_images_df = search_fn(i, n_imgs)
            c_imgs.append(i)
            c_dfs.append(similar_images_df)
            results = (similar_images_df.groupby('label').size()
                       .reset_index(name='N imgs')
                       .sort_values(by='N imgs', ascending=False))
            for label, count in zip(results['label'].to_list(), results['N imgs'].to_list()):
                name = dict_category_labels[label]
                rdict[name] = rdict.get(name, 0) + count

        cstats[c] = {'ID': c_imgs, 'dfs': c_dfs, 'res': rdict,
                     'label': dict_category_labels[c], 'len': ldf}
        print(f'{time.time() - start} secs')
    return cstats


def cosine_stats(df, trials=100, n_imgs=100, random_state=None):
    """Tally the categories retrieved by cosine similarity search, per category.

    For every distinct value of df['label'], `trials` rows are sampled and
    `get_similar_images(index, n_imgs)` is run for each; the labels of the
    returned rows are counted and accumulated by category name (looked up in
    the module-level `dict_category_labels`). Categories with fewer than
    `trials` rows are skipped with a message. Elapsed time is printed for
    every processed category.

    Args:
        df: DataFrame with a 'label' column, indexed by image ID.
        trials: number of query images sampled per category.
        n_imgs: number of similar images requested per query.
        random_state: forwarded to `DataFrame.sample`; None (default) keeps
            the sampling unseeded as before.

    Returns:
        dict keyed by label with entries
        {'ID': sampled indices, 'dfs': result frames, 'res': name -> count,
         'label': category name, 'len': number of rows with that label}.
    """
    return _similarity_stats(df, trials, n_imgs, get_similar_images, random_state)


def euclidean_stats(df, trials=100, n_imgs=100, random_state=None):
    """Tally the categories retrieved by Euclidean similarity search, per category.

    Identical to `cosine_stats` except that each query is answered by
    `get_similar_images_euclidean(index, n_imgs)`.

    Args:
        df: DataFrame with a 'label' column, indexed by image ID.
        trials: number of query images sampled per category.
        n_imgs: number of similar images requested per query.
        random_state: forwarded to `DataFrame.sample`; None (default) keeps
            the sampling unseeded as before.

    Returns:
        dict with the same layout as the one returned by `cosine_stats`.
    """
    return _similarity_stats(df, trials, n_imgs, get_similar_images_euclidean, random_state)

In [None]:
def is_same_date(new_date, start_date):
    """Return True when `new_date` equals `start_date`, otherwise False."""
    return bool(new_date == start_date)

def is_same_offset(xoff,yoff,start_xoff, start_yoff,rounded=False):
    """Tell whether two (x, y) offsets coincide.

    With `rounded=True` every coordinate is passed through int() (truncation)
    before comparing; otherwise the raw values are compared for equality.
    """
    if rounded:
        same = (int(xoff) == int(start_xoff)) and (int(yoff) == int(start_yoff))
    else:
        same = (xoff == start_xoff) and (yoff == start_yoff)
    return bool(same)

def filter_ids(ids, ID, check_off=False, rounded=False):
    """Select up to 24 candidates that do not duplicate an already kept image.

    Starting from the reference image `ID`, the index of `ids` is walked in
    order. A candidate is rejected when an already accepted image (including
    the reference) has the same 'Date' and, if `check_off` is set, also the
    same (XOffset, YOffset) as judged by `is_same_offset(..., rounded)`.
    Metadata is read from the module-level `stm` frame.

    Args:
        ids: DataFrame whose index holds the candidate image IDs.
        ID: reference image ID; never part of the result.
        check_off: also require matching offsets before rejecting.
        rounded: forwarded to `is_same_offset`.

    Returns:
        List of at most 24 accepted candidate IDs (ints), in input order.
    """
    max_kept = 24  # reference + 24 neighbours fill the 5x5 grid of plot_cosine

    meta_cache = {}

    def _meta(image_id):
        "Return [XOffset, YOffset, Date] of an image, looked up in `stm` once."
        key = int(image_id)
        if key not in meta_cache:
            meta_cache[key] = stm.loc[key][["XOffset","YOffset","Date"]].tolist()
        return meta_cache[key]

    good_ones = [int(ID)]
    for i in ids.index.tolist():
        if len(good_ones) > max_kept:
            # Anything accepted beyond this point would be cut off anyway.
            break
        x, y, d = _meta(i)
        duplicate = False
        for j in good_ones:
            xj, yj, dj = _meta(j)
            if not is_same_date(d, dj):
                continue
            # Compare against the accepted image's own offsets (xj, yj);
            # the previous code referenced undefined names here.
            if (not check_off) or is_same_offset(x, y, xj, yj, rounded):
                duplicate = True
                break
        if not duplicate:
            good_ones.append(int(i))

    # Drop the reference image and cap the result at 24 entries.
    return good_ones[1:max_kept + 1]
    

In [None]:
def plot_cosine(df, ID, listID, fsize=8, dpi=40):
    """Save a 5x5 image grid with the query image in the centre cell.

    The query image `ID` is loaded with `imgID`; rows of `df` whose index is
    in `listID` are loaded with `show_img` from the module-level `path`
    (load errors are printed and the image is skipped). The remaining cells
    are filled row by row; cells without an image are left empty. The figure
    is written to 'cosine3/<Categories>/<ID>.png' and then closed.

    Args:
        df: metadata DataFrame indexed by image ID.
        ID: ID of the query image.
        listID: IDs of the images shown around the query.
        fsize: size of one grid cell; axis text uses twice this value.
        dpi: resolution of the saved figure.
    """
    start_img = imgID(df, int(ID))
    selected = df.loc[df.index.intersection(listID)]
    images = []
    for _, image in selected.iterrows():
        try:
            images.append(show_img(path, image))
        except Exception as e:
            print(e)

    def _draw(ax, entry):
        "Plot one [metadata, topography] pair on `ax` with title and font sizes."
        meta, topo = entry[0], entry[1]
        topo.plot(ax=ax, cmap='afmhot', add_colorbar=False )
        ax.set_title(r"[{}] {} $\bf{{{}}}$".format(meta['Date'], meta['TF0_Filename'], meta.name), fontsize=20)
        texts = [ax.xaxis.label, ax.yaxis.label] + ax.get_xticklabels() + ax.get_yticklabels()
        for item in texts:
            item.set_fontsize(fsize*2)

    plt.ioff()
    rows = cols = 5
    centre = (2, 2)
    fig, axs = plt.subplots(rows, cols, figsize=((fsize*cols),(fsize*rows)))
    pending = iter(images)
    for i in range(rows):
        for j in range(cols):
            if (i, j) == centre:
                _draw(axs[i,j], start_img)
                continue
            entry = next(pending, None)
            if entry is not None:
                _draw(axs[i,j], entry)
    plt.tight_layout()
    plt.draw()

    category = start_img[0]['Categories']
    Path('cosine3/{}'.format(category)).mkdir(parents=True, exist_ok=True)
    plt.savefig('cosine3/{}/{}.png'.format(category, start_img[0].name), dpi=dpi)
    plt.close(fig)

In [None]:
def show_img(path, img):
    """Load and level the forward topography of one image.

    The file `path + img['ImageOriginalName']` is opened with omicronscala
    and its `Z_Forward` channel is processed with spym: plane, align, plane,
    fixzero(to_mean=True), in that order. The return values of those calls
    are discarded, so this presumably relies on spym modifying the array in
    place - confirm.

    Returns:
        [img, tf]: the metadata passed in and the processed channel.
    """
    filename = img['ImageOriginalName']
    dataset = omicronscala.to_dataset(Path(path+filename))
    tf = dataset.Z_Forward
    for level in (tf.spym.plane, tf.spym.align, tf.spym.plane):
        level()
    tf.spym.fixzero(to_mean=True)
    return [img, tf]

def imgID(df, ID):
    """Load and level the forward topography of the image with index `ID`.

    The row `df.loc[ID]` supplies `ImageOriginalName`, which is appended to
    the module-level `path`. The file is opened with omicronscala and its
    `Z_Forward` channel is processed with spym: plane, align, plane,
    fixzero(to_mean=True), in that order (presumably in place - confirm).

    Returns:
        [img, tf]: the metadata row and the processed channel.
    """
    img = df.loc[ID]
    filename = img.ImageOriginalName
    dataset = omicronscala.to_dataset(Path(path+filename))
    tf = dataset.Z_Forward
    for level in (tf.spym.plane, tf.spym.align, tf.spym.plane):
        level()
    tf.spym.fixzero(to_mean=True)
    return [img, tf]

In [None]:
def plot_cosine_results(stats):
    """Draw one bar chart per category on a 4x4 grid and save the figure.

    For every entry of `stats` the counts in 'res' are divided by their
    maximum and plotted against the category names; the bar of the entry's
    own category is green, all others blue. Unused grid cells are switched
    off. The figure is saved as 'cosine_S_100x100.png'.

    Args:
        stats: dict as returned by `cosine_stats` (uses 'res' and 'label').
    """
    plt.ioff()
    rows, cols = 4, 4
    fig, axs = plt.subplots(rows, cols, figsize=(2+(10*cols),10*rows))
    fig.suptitle('Cosine similarity for 100 trials of 100 images for each category', fontsize=36)
    ids = list(stats.keys())

    # axs.flat walks the grid row by row, matching the nested i/j order.
    for slot, ax in enumerate(axs.flat):
        if slot >= len(ids):
            ax.axis('off')
            continue
        entry = stats[ids[slot]]
        names = list(entry['res'].keys())
        counts = list(entry['res'].values())
        peak = max(counts)
        heights = [x/peak for x in counts]
        own_label = entry['label']
        colours = ['g' if x == own_label else 'b' for x in names]

        ax.bar(names, heights, color=colours)
        ax.set_title('{}'.format(own_label), fontsize=28)
        texts = [ax.xaxis.label, ax.yaxis.label] + ax.get_xticklabels() + ax.get_yticklabels()
        for item in texts:
            item.set_fontsize(16)
        ax.set_ylabel('N images')
        for tick in ax.get_xticklabels():
            tick.set_rotation(90)

    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.show()
    fig.savefig('cosine_S_100x100.png')

    
def plot_cosine_results_norm(stats, prefix, trials, n_imgs, len_dict):
    """Draw per-category retrieval fractions next to reference values and save.

    For every entry of `stats` the counts in 'res' are divided by their sum
    and drawn as bars (green for the entry's own category, blue otherwise);
    a red bar with the matching `len_dict` value is drawn beside each one.
    Unused cells of the 4x4 grid are switched off. The figure is saved as
    '<prefix>_S_<trials>x<n_imgs>.png'.

    Args:
        stats: dict as returned by `cosine_stats` / `euclidean_stats`.
        prefix: text used in the figure title and the file name.
        trials: number shown in the title and file name.
        n_imgs: number shown in the title and file name.
        len_dict: category name -> value plotted as the red reference bar.
    """
    plt.ioff()
    rows, cols = 4, 4
    fig, axs = plt.subplots(rows, cols, figsize=(2+(10*cols),10*rows))
    fig.suptitle('{} similarity for {} trials of {} images for each category'.format(prefix, trials, n_imgs), fontsize=36)
    ids = list(stats.keys())

    # axs.flat walks the grid row by row, matching the nested i/j order.
    for slot, ax in enumerate(axs.flat):
        if slot >= len(ids):
            ax.axis('off')
            continue
        entry = stats[ids[slot]]
        names = list(entry['res'].keys())
        counts = list(entry['res'].values())
        reference = [len_dict[x] for x in names]
        total = sum(counts)
        fractions = [x/total for x in counts]
        own_label = entry['label']
        colours = ['g' if x == own_label else 'b' for x in names]

        X = np.arange(len(names))
        w = 0.3
        ax.bar(X+0.0, fractions, w, color=colours)
        ax.bar(X+0.3, reference, w, color='r')
        ax.set_xticks(X)
        ax.set_xticklabels(names)
        ax.set_title('{}'.format(own_label), fontsize=28)
        texts = [ax.xaxis.label, ax.yaxis.label] + ax.get_xticklabels() + ax.get_yticklabels()
        for item in texts:
            item.set_fontsize(16)
        ax.set_ylabel('N images')
        for tick in ax.get_xticklabels():
            tick.set_rotation(90)

    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.show()
    fig.savefig('{}_S_{}x{}.png'.format(prefix, trials, n_imgs))
    

In [None]:
# folder (string prefix) holding the original image files
path = 'path_to_images'

# STM metadata frame, indexed by image ID
stm = loadPickle('clean_stm')

# manually selected good images, grouped by sample name
goods_dict = {
    "N_Gr_Ni111": [87980, 87931, 88019, 87795, 84551,
                   84568, 87048, 83206, 87912, 87774],
    "Gr_Ni111": [85804, 83502, 83080, 83018, 83005,
                 82729, 82061, 50736, 50701, 49062],
    "Gr_Ni100": [77857, 77795, 77809, 77690, 77696,
                 79863, 77649, 77626, 79779, 79729],
    "NFFA_ID617": [86169, 86687, 86374, 83699, 83734,
                   84700, 83880, 84133, 85763, 85800],
}

# for every selected image: take the 250 most similar images, filter out
# same-date / same-offset duplicates and save the resulting grid
for sample_name, image_ids in goods_dict.items():
    print('\n{}'.format(sample_name))
    for ID in image_ids:
        print('\t{}'.format(ID))
        ids = get_similar_images(ID, 250)[2]
        list_ID = filter_ids(ids, ID, check_off=True, rounded=True)
        plot_cosine(stm, ID, list_ID)

In [None]:
# single image similarity example: 100 nearest images (Euclidean) to image 44148
base_image, base_label, similar_images_df = get_similar_images_euclidean(44148, 100)
print(base_label, base_image, sep='\n')
# not the last expression of the cell, so the opened image is not displayed
open_image(base_image)
show_similar_images(similar_images_df)

In [None]:
#features validation by statistical analysis on extracted images from similarity search

# plot_cosine_results_norm requires the `prefix` argument (used in the figure
# title and the output file name); it was missing from two of the calls.
stats2 = cosine_stats(img_repr_df, 500, 20)
plot_cosine_results_norm(stats2, 'cosine', 500, 20, len_dict)
savePickle(stats2, 'cosine_500_20')

stats3 = euclidean_stats(img_repr_df, 100, 100)
plot_cosine_results_norm(stats3, 'euclidean', 100, 100, len_dict)
savePickle(stats3, 'euclidean_100_100')

stats4 = euclidean_stats(img_repr_df, 500, 20)
plot_cosine_results_norm(stats4, 'euclidean', 500, 20, len_dict)
savePickle(stats4, 'euclidean_500_20')

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression

def compute_ID (X):
    """Estimate a dimension from ratios of nearest-neighbour distances.

    For every row of `X` the ratio `mu` between the distance columns 2 and 1
    of a 3-neighbour query is computed (column 0 is presumably the point
    itself, since the query set equals the fitted set - confirm). The rows
    are ranked by `mu` to form an empirical cumulative distribution F, and
    the slope of a through-origin linear fit of -log(1 - F) on log(mu) is
    returned. This appears to follow the TwoNN estimator - confirm.

    Args:
        X: array-like of shape (n_points, n_features).

    Returns:
        The fitted slope (float).
    """
    n_points = X.shape[0]
    neighbours = NearestNeighbors(n_neighbors=3, algorithm='kd_tree', n_jobs=-1)
    neighbours.fit(X)
    nn_distances, _ = neighbours.kneighbors(X)

    mu = nn_distances[:, 2] / nn_distances[:, 1]

    # Empirical CDF: the point with the k-th smallest mu gets the value k / N.
    order = np.argsort(mu)
    F_emp = np.zeros(n_points, dtype=float)
    F_emp[order] = np.arange(n_points) / n_points

    log_mu = np.log(mu).reshape(-1, 1)
    neg_log_survival = -np.log(1. - F_emp).reshape(-1, 1)
    regression = LinearRegression(fit_intercept=False, n_jobs=1)
    regression.fit(log_mu, neg_log_survival)
    return regression.coef_[0, 0]

def plot_components(data, model, images=None, ax=None,
                    thumb_frac=0.05, cmap='gray'):
    """Scatter-plot a 2-D projection of `data`, optionally with thumbnails.

    Args:
        data: samples passed to `model.fit_transform`; must expose `.shape`.
        model: object with a `fit_transform` method returning an array whose
            first two columns are plotted.
        images: optional sequence of images, one per sample. When given, a
            thumbnail is drawn at a sample's projected position unless it
            lies too close to an already drawn thumbnail.
        ax: axes to draw on; the current pyplot axes are used when None.
        thumb_frac: minimum spacing between thumbnails, as a fraction of the
            largest extent of the projection.
        cmap: colormap used for the thumbnails.
    """
    if ax is None:
        ax = plt.gca()

    proj = model.fit_transform(data)
    ax.plot(proj[:, 0], proj[:, 1], '.k')

    if images is None:
        return

    # `offsetbox` is not imported anywhere else in the notebook; import it
    # here so the thumbnail branch does not fail with a NameError.
    from matplotlib import offsetbox

    min_dist_2 = (thumb_frac * max(proj.max(0) - proj.min(0))) ** 2
    # Seed with a point outside the data range so the first sample is shown.
    shown_images = np.array([2 * proj.max(0)])
    for i in range(data.shape[0]):
        dist = np.sum((proj[i] - shown_images) ** 2, 1)
        if np.min(dist) < min_dist_2:
            # don't show points that are too close
            continue
        shown_images = np.vstack([shown_images, proj[i]])
        imagebox = offsetbox.AnnotationBbox(
            offsetbox.OffsetImage(images[i], cmap=cmap),
            proj[i])
        ax.add_artist(imagebox)

In [None]:
from sklearn.manifold import Isomap
# 2-D Isomap embedding of the saved feature tensor
mod = Isomap(n_components=2)
xx = torch.load('S_features_resnet50_4096')

# draw on the freshly created axes (they are also the current pyplot axes)
fig, ax = plt.subplots(figsize=(10, 10))
plot_components(xx, model=mod, ax=ax)

In [None]:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

# K-means inertia for k = 2..19, used to look for an elbow.
X = xx
k_values = range(2, 20)
distorsions = []
for k in k_values:
    # NOTE: this rebinds the name `kmeans`, which the imports cell bound to
    # the `kmeans` function from kmeans_pytorch.
    kmeans = KMeans(n_clusters=k).fit(X)
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(k_values, distorsions)
plt.grid(True)
plt.title('Elbow curve')

In [None]:
# Second finite difference of the inertia curve.
dy2 = np.diff(distorsions, n=2)

# dy2[i] = y[i+2] - 2*y[i+1] + y[i] is centred on the middle sample i+1.
# `distorsions` starts at k=2, so dy2[i] belongs to k = 3 + i (the previous
# x-axis, range(2, 18), was shifted by one cluster).
k_centres = range(3, 3 + len(dy2))

fig = plt.figure(figsize=(15, 5))
plt.plot(k_centres, dy2)
plt.grid(True)
plt.xlabel('k (number of clusters)')
plt.ylabel('second difference of inertia')
plt.title('Elbow curve')

In [None]:
# Second finite difference of the log-inertia curve.
dy2 = np.diff(np.log(distorsions), n=2)

# dy2[i] = y[i+2] - 2*y[i+1] + y[i] is centred on the middle sample i+1.
# `distorsions` starts at k=2, so dy2[i] belongs to k = 3 + i (the previous
# x-axis, range(2, 18), was shifted by one cluster).
k_centres = range(3, 3 + len(dy2))

fig = plt.figure(figsize=(15, 5))
plt.plot(k_centres, dy2)
plt.grid(True)
plt.xlabel('k (number of clusters)')
plt.ylabel('second difference of log inertia')
plt.title('Elbow curve')