In this notebook we:
- Load all the feature vectors from the data folder
- Make some pathways using `get_pathway`, which is an optimised version of `fv_spaced_pathway_nD` and is the function used in the API code
- Have a look at some pairs of images with a range of different distances between them
- Look at a sample of the images plotted in the reduced feature space, and plot a pathway

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
from tqdm import tqdm

from PIL import Image
import matplotlib.pyplot as plt
from matplotlib import gridspec
from scipy.spatial.distance import cdist

In [None]:
cd ..

In [None]:
from src.network_functions import (
    get_pathway,
    image_pathway_plot,
    image_pathway_scaled_plot,
    reduce_data_nd,
    visualize_scatter_with_images,
)

## Get the feature vectors for all the images

In [None]:
feature_vectors_ids = np.load("data/20190822_feature_vectors_ids.npy")
feature_vectors = np.load("data/20190822_feature_vectors.npy")

In [None]:
len(feature_vectors_ids)

In [None]:
images_dir = "data/images/"
image_type = ".png"

## Look at some random pathways using all the images

In [None]:
# Draw ten random (start, end) id pairs from the full set, build and plot a
# pathway for each, and keep the end points and the returned path distances
# so they can be inspected in later cells.
n_nodes, sample_size = 10, None
nodes_used, all_path_dists = [], []
for trial in tqdm(range(10)):
    id_1 = np.random.choice(feature_vectors_ids)
    id_2 = np.random.choice(feature_vectors_ids)
    node_path, path_dists = get_pathway(
        feature_vectors_ids,
        feature_vectors,
        id_1,
        id_2,
        n_nodes,
        sample_size,
    )
    print(node_path)
    image_pathway_plot(images_dir, image_type, node_path)

    all_path_dists.extend(path_dists)
    nodes_used.append([id_1, id_2])

In [None]:
nodes_used

In [None]:
# Repeat of the random-pathway experiment above with a fresh set of random
# end points. Note that `nodes_used` and `all_path_dists` are reset here, so
# the histogram plotted afterwards only reflects the ten pathways of this run.
nodes_used = []
all_path_dists = []
n_nodes = 10
sample_size = None
for i in tqdm(range(0, 10)):
    # Start and end ids are drawn independently from the full id list.
    id_1 = np.random.choice(feature_vectors_ids)
    id_2 = np.random.choice(feature_vectors_ids)
    node_path, path_dists = get_pathway(
        feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
    )
    print(node_path)
    image_pathway_plot(images_dir, image_type, node_path)

    nodes_used.append([id_1, id_2])
    all_path_dists.extend(path_dists)

In [None]:
nodes_used

In [None]:
plt.hist(all_path_dists)
plt.show()

## Pick particular pathways

In [None]:
# Hand-picked pathway between two specific image ids over the full set,
# drawn with `image_pathway_scaled_plot`. The following cell plots the same
# pair with `image_pathway_plot` for comparison.
id_1 = "B0006448"
id_2 = "V0021276"

n_nodes = 8
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
print(node_path)
image_pathway_scaled_plot(images_dir, image_type, node_path)

In [None]:
id_1 = "B0006448"
id_2 = "V0021276"

n_nodes = 8
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
print(node_path)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
# Hand-picked pathway between two specific image ids over the full set.
id_1 = "B0008895"
id_2 = "M0010374"  # alternative end points: 'V0005248', 'V0006023', 'V0001893'

n_nodes = 9
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
print(node_path)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
id_1 = "V0046313"
id_2 = "L0061460"

n_nodes = 9
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
print(node_path)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
id_1 = "V0001893"
id_2 = "V0047369EL"

n_nodes = 9
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
print(node_path)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
id_1 = "A0000001"
id_2 = "B0006893"

n_nodes = 8
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
node_path

In [None]:
id_1 = "B0002621"
id_2 = "V0010033"

n_nodes = 10
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
node_path

In [None]:
id_1 = "A0000785"
id_2 = "V0040933"

n_nodes = 8
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
node_path

In [None]:
# Hand-picked pathway between two specific image ids over the full set.
id_1 = "V0044783"
id_2 = "V0023117"

n_nodes = 7
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
node_path

In [None]:
id_1 = "V0044783"
id_2 = "V0046793"

n_nodes = 8
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
)
print(node_path)
image_pathway_plot(images_dir, image_type, node_path)

## Get all the distances for a sample
- Plot a very different pair (dist > 373)
- Plot a very similar pair (dists<13) & (dists!=0)
- Plot a quite similar pair (dists<46) & (dists>45.999) : These numbers are typical distances in the pathway finder

In [None]:
sample_size = 10000

In [None]:
# Reproducible subset: with a fixed seed, draw `sample_size` distinct row
# indices and take the matching feature vectors and ids.
np.random.seed(0)
rand_args = np.random.choice(
    np.arange(len(feature_vectors)),
    size=sample_size,
    replace=False,
)
feature_vectors_ids_sample = feature_vectors_ids[rand_args]
feature_vectors_sample = feature_vectors[rand_args]

In [None]:
dists = cdist(feature_vectors_sample, feature_vectors_sample)

In [None]:
# ravel() returns a flat view when no copy is needed, whereas flatten() always
# allocates a second full-size copy of the distance matrix. `all_dists` is
# only read afterwards, so sharing memory with `dists` is safe.
all_dists = dists.ravel()

In [None]:
plt.hist(all_dists, bins=50)
plt.show()

In [None]:
# Target distances: ten equally spaced values spanning the observed range, so
# that pairs can be plotted in order from most similar to least similar.
# The ndarray reductions are used because the builtin min()/max() would walk
# every element of the flattened distance matrix in Python.
ideal_dists = np.linspace(all_dists.min(), all_dists.max(), num=10)
# To look at a narrower band instead, use e.g. np.linspace(40, 50, num=10)

In [None]:
ideal_dists

In [None]:
# For each target distance, pick one random pair of sample images whose
# distance lies strictly within 0.1 of the target and show them side by side.
# NOTE: `dists` includes the zero diagonal, so the band around the minimum
# distance contains every image paired with itself.
for ideal_dist in ideal_dists:
    pair_indices = np.argwhere(
        (dists < (ideal_dist + 0.1)) & (dists > (ideal_dist - 0.1))
    )
    if pair_indices.shape[0] == 0:
        # np.random.choice(0) raises ValueError, so skip bands with no pairs
        # instead of aborting the whole loop.
        print(f"No pairs found within 0.1 of distance {ideal_dist}")
        continue

    pairs = feature_vectors_ids_sample[pair_indices]
    index = np.random.choice(pairs.shape[0])
    pair = pairs[index]
    print(pair)

    fig = plt.figure(figsize=(20, 10))
    columns = len(pair)
    for i, image_name in enumerate(pair):
        ax = plt.subplot(2, columns, i + 1)
        ax.set_axis_off()
        # The context manager closes the file handle even if plotting fails.
        with Image.open(images_dir + image_name + image_type) as image:
            plt.imshow(image)

## Get a few samples of pairs of images with high, low and quite low distances between them

In [None]:
# Id pairs from the sample whose distance exceeds 373. Each pair is sorted so
# that [a, b] and [b, a] become the same row, then repeated rows are dropped.
v_different_pairs = np.unique(
    np.sort(feature_vectors_ids_sample[np.argwhere(dists > 373)]),
    axis=0,
)
v_different_pairs.shape

In [None]:
# One figure per far-apart pair: the images are placed side by side in the
# top row of a two-row subplot grid, with the axes hidden.
for figs in v_different_pairs:
    fig = plt.figure(figsize=(20, 10))
    columns = len(figs)
    for slot, image_name in enumerate(figs, start=1):
        image = Image.open(f"{images_dir}{image_name}{image_type}")
        ax = plt.subplot(2, columns, slot)
        ax.set_axis_off()
        plt.imshow(image)
        image.close()

In [None]:
v_similar_pairs = feature_vectors_ids_sample[np.argwhere((dists < 13) & (dists != 0))]
v_similar_pairs = np.unique(np.sort(v_similar_pairs), axis=0)
v_similar_pairs.shape

In [None]:
v_similar_pairs

In [None]:
# One small figure per very similar pair: the images are placed side by side
# in the top row of a two-row subplot grid, with the axes hidden.
for figs in v_similar_pairs:
    fig = plt.figure(figsize=(5, 2))
    columns = len(figs)
    for slot, image_name in enumerate(figs, start=1):
        image = Image.open(f"{images_dir}{image_name}{image_type}")
        ax = plt.subplot(2, columns, slot)
        ax.set_axis_off()
        plt.imshow(image)
        image.close()

In [None]:
quite_similar_pairs = feature_vectors_ids_sample[
    np.argwhere((dists < 35) & (dists > 34.99))
]
quite_similar_pairs = np.unique(np.sort(quite_similar_pairs), axis=0)
quite_similar_pairs.shape

In [None]:
quite_similar_pairs

In [None]:
# One figure per quite similar pair: the images are placed side by side in
# the top row of a two-row subplot grid, with the axes hidden.
for figs in quite_similar_pairs:
    fig = plt.figure(figsize=(20, 10))
    columns = len(figs)
    for slot, image_name in enumerate(figs, start=1):
        image = Image.open(f"{images_dir}{image_name}{image_type}")
        ax = plt.subplot(2, columns, slot)
        ax.set_axis_off()
        plt.imshow(image)
        image.close()

## Plot the whole (well from the sample) image space with the pathway

In [None]:
np.random.choice(feature_vectors_ids_sample, 10)

In [None]:
# Pathway computed over the sampled ids/vectors only (not the full set), so
# that it can be drawn on top of the reduced sample space below.
# NOTE(review): both ids presumably have to be present in the sample for
# `get_pathway` to work here — confirm. `add_new_images` (defined further
# down) instead builds the path on the full set and appends any missing
# nodes to the sample.
id_1 = "V0001893"
id_2 = "V0047369EL"

n_nodes = 10
sample_size = None

node_path, path_dists = get_pathway(
    feature_vectors_ids_sample, feature_vectors_sample, id_1, id_2, n_nodes, sample_size
)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
node_path

In [None]:
# Map each sampled id to its feature vector as a plain list, which is the
# form passed to `reduce_data_nd` in the next cell.
feature_vectors_sample_trans = dict(
    zip(feature_vectors_ids_sample, map(list, feature_vectors_sample))
)

In [None]:
x_data = reduce_data_nd(feature_vectors_sample_trans)

In [None]:
visualize_scatter_with_images(
    x_data,
    image_name_list=feature_vectors_ids_sample,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(30, 20),
    image_zoom=0.1,
    pathway=node_path,
)

In [None]:
visualize_scatter_with_images(
    x_data,
    image_name_list=feature_vectors_ids_sample,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(30, 20),
    image_zoom=0.13,
)

## Add extra images to the sample if there are specific images you want

In [None]:
def add_new_images(
    id_1,
    id_2,
    feature_vectors_ids,
    feature_vectors,
    feature_vectors_ids_sample,
    feature_vectors_sample,
    n_nodes,
):
    """Build a pathway on the full image set and extend the sample to contain it.

    The pathway between ``id_1`` and ``id_2`` is computed with ``get_pathway``
    over the full set. Every pathway node that is not already in the sample is
    appended to the sample (ids and feature vectors), and the extended sample
    is then passed through ``reduce_data_nd``.

    Parameters
    ----------
    id_1, id_2
        Ids of the two end-point images, passed straight to ``get_pathway``.
    feature_vectors_ids : numpy.ndarray
        Ids of the full image set (indexed with an integer array here).
    feature_vectors : numpy.ndarray
        Feature vectors of the full set, row-aligned with
        ``feature_vectors_ids``.
    feature_vectors_ids_sample : numpy.ndarray
        Ids of the sample to extend.
    feature_vectors_sample : numpy.ndarray
        Feature vectors of the sample, row-aligned with
        ``feature_vectors_ids_sample``.
    n_nodes
        Node count argument forwarded to ``get_pathway``.

    Returns
    -------
    tuple
        ``(node_path, x_data, feature_vectors_ids_sample_extra)``: the pathway
        as returned by ``get_pathway``, the output of ``reduce_data_nd`` for
        the extended sample, and the ids of the extended sample (original
        sample ids followed by the newly added ones).
    """
    # The pathway is searched over the full set, so no sub-sampling is requested.
    sample_size = None

    node_path, _ = get_pathway(
        feature_vectors_ids, feature_vectors, id_1, id_2, n_nodes, sample_size
    )

    # Pathway nodes that the sample does not contain yet.
    path_ids = np.asarray(node_path)
    missing_ids = path_ids[~np.isin(path_ids, feature_vectors_ids_sample)]

    # Rows of the full set holding those nodes, in ascending row order. One
    # vectorised membership test replaces a Python-level scan over every id.
    missing_rows = np.flatnonzero(np.isin(feature_vectors_ids, missing_ids))

    feature_vectors_ids_sample_extra = np.concatenate(
        (feature_vectors_ids_sample, feature_vectors_ids[missing_rows])
    )
    feature_vectors_sample_extra = np.concatenate(
        (feature_vectors_sample, feature_vectors[missing_rows])
    )

    # reduce_data_nd is given a mapping of id -> feature vector as a list.
    feature_vectors_sample_extra_trans = {
        fv_id: list(vector)
        for fv_id, vector in zip(
            feature_vectors_ids_sample_extra, feature_vectors_sample_extra
        )
    }

    x_data = reduce_data_nd(feature_vectors_sample_extra_trans)

    return node_path, x_data, feature_vectors_ids_sample_extra

In [None]:
node_path, x_data, feature_vectors_ids_sample_extra = add_new_images(
    "V0044783",
    "V0046793",
    feature_vectors_ids,
    feature_vectors,
    feature_vectors_ids_sample,
    feature_vectors_sample,
    8,
)

In [None]:
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
visualize_scatter_with_images(
    x_data,
    image_name_list=feature_vectors_ids_sample_extra,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(30, 20),
    image_zoom=0.1,
    pathway=node_path,
)

In [None]:
node_path, x_data, feature_vectors_ids_sample_extra = add_new_images(
    "B0008895",
    "M0010374",
    feature_vectors_ids,
    feature_vectors,
    feature_vectors_ids_sample,
    feature_vectors_sample,
    9,
)

In [None]:
print(node_path)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
visualize_scatter_with_images(
    x_data,
    image_name_list=feature_vectors_ids_sample_extra,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(30, 20),
    image_zoom=0.1,
    pathway=node_path,
)