In this notebook we:
- Compute the pairwise distances between feature vectors (using a sample of images)
- Create graphs from different types of distance matrices (using a top-n neighbour approach, a cosine distance threshold, or both)
- Get the dijkstra_path between 2 random nodes using the G_top, G_threshold and G_top_threshold networks
- Plot the UMAP-reduced embedding of the images with the path shown


In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to `src` are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from tqdm import tqdm
import os
from io import BytesIO
import ast
import numpy as np
import pickle

from PIL import Image
import torch
import boto3
from scipy.spatial.distance import cdist
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from itertools import combinations
import umap.umap_ as umap

In [None]:
cd ..

In [None]:
from src.network_functions import (
    load_images,
    load_specific_images,
    get_all_s3_keys,
    import_feature_vectors,
    get_distances,
    get_top_neighbours,
    get_high_neighbours,
    get_top_high_neighbours,
    create_graph,
    plot_graph,
    create_network_graph,
    visualise_clusters,
    reduce_data,
    get_random_node_path,
    image_pathway_plot,
    visualize_scatter_with_images,
    reorder_images,
)

In [None]:
# Collect the base names (extension stripped) of every PNG in the data folder.
images_dir = "data/"
image_type = ".png"

# Match on the file suffix rather than on a substring: `image_type in file`
# would also accept names such as "a.png.bak", whose stripped name ("a.png")
# is not an image base name.
image_names = [
    os.path.splitext(file)[0]
    for file in os.listdir(images_dir)
    if file.endswith(image_type)
]
len(image_names)

### Create a sample of images to work with (you can use all of them if you want, but it takes a long time).
At the moment, not all of these will have feature vectors in S3.

In [None]:
n_sample = 2000
np.random.seed(0)  # For dev: fixed seed so the same sample is drawn on every run
# Sampling without replacement raises ValueError when more items are requested
# than exist, so cap the request at the number of available images.
image_name_list = np.random.choice(
    image_names, size=min(n_sample, len(image_names)), replace=False
)

## Import feature vectors as they are (>4000 dimensions)

In [None]:
# S3 location (bucket and folder) passed to import_feature_vectors below.
bucket_name = "miro-images-feature-vectors"
folder_name = "feature_vectors"

s3 = boto3.client("s3")

In [None]:
# Load the feature vectors for the sampled images; the second return value of
# import_feature_vectors is not used in this notebook.
feature_vectors, _ = import_feature_vectors(
    s3, bucket_name, folder_name, image_name_list
)

# Remove the name of this image from the list if no feature vector was found for it.
# Membership is tested on the mapping itself instead of rebuilding
# `list(feature_vectors.keys())` for every name (which was quadratic).
image_name_list = [x for x in image_name_list if x in feature_vectors]

In [None]:
# Number of feature vectors that were actually loaded.
len(feature_vectors)

## Look into the cosine distances between feature vectors

In [None]:
# Distance matrix computed from the loaded feature vectors.
# NOTE(review): presumably pairwise cosine distances (per the section heading),
# with rows/columns in feature_vectors' iteration order — confirm in
# src.network_functions.get_distances.
dist_mat = get_distances(feature_vectors)

In [None]:
# Report the distance below which the lowest `percentile_var` percent of the
# matrix entries fall, then plot and save a histogram of all the distances.
percentile_var = 0.2  # a percentage (0-100 scale), as np.percentile expects
p = np.percentile(dist_mat, percentile_var)
print(
    "{}% of the data (number of neighbours".format(percentile_var),
    "for each node will be about {})".format(len(dist_mat) * (percentile_var / 100)),
    "has a cosine distance below {}".format(round(p, 2)),
)
fig = plt.figure(figsize=(10, 5))
plt.hist(dist_mat.flatten(), bins=30)
plt.xlabel("Cosine distance")
plt.ylabel("Frequency")
plt.savefig("cosine_dists.png")
# Show before closing: once plt.close(fig) has run, pyplot no longer tracks
# the figure, so a later plt.show() displays nothing.
plt.show()
plt.close(fig)

In [None]:
# Neighbour count passed to get_top_neighbours / get_top_high_neighbours.
# X degrees of separation uses 15, but perhaps this is too much; should it be
# a fraction of n_sample?
n = 3
# Distance threshold passed to get_high_neighbours / get_top_high_neighbours.
dist_threshold = 0.17

In [None]:
# Distance matrix for the "top n neighbours" approach.
# NOTE(review): presumably keeps only each node's n closest neighbours —
# confirm in src.network_functions.get_top_neighbours.
dist_mat_top = get_top_neighbours(dist_mat, n)

In [None]:
# Distance matrix for the "cosine distance threshold" approach.
# NOTE(review): presumably keeps only pairs on the close side of
# dist_threshold — confirm in src.network_functions.get_high_neighbours.
dist_mat_threshold = get_high_neighbours(dist_mat, dist_threshold)

In [None]:
# Distance matrix built with both the neighbour count and the threshold.
# NOTE(review): how the two criteria are combined is defined in
# src.network_functions.get_top_high_neighbours — confirm there.
dist_mat_top_threshold = get_top_high_neighbours(dist_mat, n, dist_threshold)

### Create graphs with different types of distance matrices (whether you use a top n neighbour approach or a cosine distance threshold)

In [None]:
# Graph built from the top-n neighbour distance matrix.
G_top = create_network_graph(dist_mat_top)

In [None]:
# Graph built from the threshold-based distance matrix.
G_threshold = create_network_graph(dist_mat_threshold)

In [None]:
# Graph built from the distance matrix that uses both n and the threshold.
G_top_threshold = create_network_graph(dist_mat_top_threshold)

In [None]:
# Edge counts for the top-n, threshold and combined graphs, in that order.
for graph in (G_top, G_threshold, G_top_threshold):
    print(len(graph.edges()))

In [None]:
figsize = (5, 5)
# Draw each graph at the same size; plot_graph's return value is discarded.
for graph in (G_top, G_threshold, G_top_threshold):
    _ = plot_graph(graph, figsize=figsize)

In [None]:
# Map each position in image_name_list to the image name at that position.
image_names_dict = dict(enumerate(image_name_list))

In [None]:
# Get the dijkstra_path between 2 random nodes using G_top_threshold
# NOTE(review): presumably image_names_dict is used to translate node indices
# into image names, and image_pathway_plot shows the images along the path —
# confirm in src.network_functions.
node_path = get_random_node_path(G_top_threshold, image_names_dict)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
# Get the dijkstra_path between 2 random nodes using G_top
# This rebinds node_path, replacing any path computed in an earlier cell.
node_path = get_random_node_path(G_top, image_names_dict)
image_pathway_plot(images_dir, image_type, node_path)

In [None]:
# Get the dijkstra_path between 2 random nodes using G_threshold
# This rebinds node_path; when the cells are run in order, this is the path
# later passed as `pathway` to visualize_scatter_with_images.
node_path = get_random_node_path(G_threshold, image_names_dict)
image_pathway_plot(images_dir, image_type, node_path)

## Plot a pathway on the images

In [None]:
# Reduced representation of the feature vectors, used as the scatter data below.
# NOTE(review): presumably a 2-D UMAP projection (per the notebook intro) whose
# rows follow feature_vectors' iteration order; confirm that this order matches
# image_name_list, which is passed alongside it to the plot.
x_data = reduce_data(feature_vectors)

In [None]:
# Scatter plot of the reduced data with the images and the most recently
# computed node_path passed in as the pathway to draw.
# NOTE(review): presumably each row of x_data is paired with the name at the
# same position in image_name_list — confirm in visualize_scatter_with_images.
visualize_scatter_with_images(
    x_data,
    image_name_list=image_name_list,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(20, 20),
    image_zoom=0.15,
    pathway=node_path,
)