In Graph_pathways_comparison I looked at a whole host of different parameters. Here I focus on the settings I decided were best:
- Use feature vectors
- Use a graph made from the top 3 neighbours

I also run this with more data.

In [None]:
# Reload imported modules automatically before each cell is executed, so edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from tqdm import tqdm
import os
from io import BytesIO
import ast
import numpy as np
import pickle
from itertools import compress
from collections import Counter
import operator
from functools import partial

from PIL import Image
import torch
import boto3
from scipy.spatial.distance import cdist
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from itertools import combinations
import umap.umap_ as umap

In [None]:
cd ..

In [None]:
from src.network_functions import (
    import_feature_vectors,
    get_top_neighbours,
    get_distances,
    get_all_s3_keys,
    image_pathway_plot,
    plot_graph,
    defined_path,
    reduce_data_nd,
    visualize_scatter_with_images,
    create_network_graph,
)

In [None]:
# Directory and file extension handed to the image plotting helpers below.
images_dir, image_type = "data/", ".png"

### 1. Get the names of the feature vectors I found

Pick a sample if you want to make it quicker

In [None]:
# Name of the S3 bucket to read from, and the boto3 client used for every
# S3 call in this notebook.
bucket_name = "miro-images-feature-vectors"
s3 = boto3.client("s3")

In [None]:
# Collect the object keys of the bucket via the project helper
# (presumably every key in the bucket — confirm against get_all_s3_keys).
keys = get_all_s3_keys(bucket_name, s3)

In [None]:
# Keep only keys whose first path component is `folder_name`, and reduce each
# of them to its final path component (the image name).
folder_name = "feature_vectors"
image_names = [
    os.path.basename(key) for key in keys if key.partition("/")[0] == folder_name
]

In [None]:
n_sample = 10000
np.random.seed(0)  # For dev
# Sampling without replacement raises ValueError when more items are requested
# than exist, so cap the request at the number of available names.
image_names = np.random.choice(
    image_names, min(n_sample, len(image_names)), replace=False
)

In [None]:
# Show how many image names were sampled.
len(image_names)

### 2. Download the feature vectors from S3

In [None]:
# Fetch the feature vectors for the sampled images. `folder_name` is reused
# (instead of repeating the literal) so the download always reads from the same
# prefix that was used to select `image_names`. The second return value of the
# helper is not needed here.
feature_vectors, _ = import_feature_vectors(s3, bucket_name, folder_name, image_names)

In [None]:
# Remove the name of this image from the list if no feature vector data was found for it.
# Membership is tested on the mapping itself; rebuilding a list of its keys for
# every name made this quadratic in the sample size.
image_names = [x for x in image_names if x in feature_vectors]

In [None]:
# Map each position in `image_names` to the image name at that position.
image_names_dict = dict(enumerate(image_names))

### 3. Make graph

In [None]:
# Number of neighbours kept per image when building the graph.
number_neighbours = 3
# Distance matrix computed from the feature vectors by the project helper.
# NOTE(review): node indices are later translated to names through
# image_names_dict, which assumes the matrix rows follow the order of
# image_names — confirm get_distances preserves that order.
dist_mat = get_distances(feature_vectors)
# Restrict the matrix to the top `number_neighbours` neighbours
# (exact semantics live in get_top_neighbours — TODO confirm).
dist_mat_neighbours = get_top_neighbours(dist_mat=dist_mat, n=number_neighbours)

In [None]:
# Build the graph used for all pathway searches from the neighbour-restricted
# distance matrix.
G = create_network_graph(dist_mat_neighbours)

### 4. Compare my_defined and dijkstra pathways for the furthest apart nodes

In [None]:
# np.where returns one index array per matrix axis, so the number of entries
# sharing the maximum is the length of one of those arrays — the length of the
# returned tuple is just the number of axes.
high_coords = np.where(dist_mat == np.amax(dist_mat))
print(
    "Picking the first highest cosine out of {} with the same highest value".format(
        len(high_coords[0])
    )
)
# First (row, column) position holding the maximum value.
node1, node2 = high_coords[0][0], high_coords[1][0]
print(node1)
print(node2)
print(image_names_dict[node1])
print(image_names_dict[node2])

In [None]:
# node1 = np.random.choice(list(image_names_dict))
# node2 = np.random.choice(list(image_names_dict))

In [None]:
def run_pathway(
    G,
    pathway_algo,
    node1,
    node2,
    image_names_dict,
    images_dir,
    image_type,
    path_size=None,
    best_path=True,
    best_type="sum",
):
    """Find a path between two nodes of ``G`` and plot the images along it.

    Args:
        G: Graph to search.
        pathway_algo: Path-finding callable. Supported values are
            ``nx.dijkstra_path``, ``nx.astar_path`` and ``defined_path``.
        node1: Node the path starts from.
        node2: Node the path ends at.
        image_names_dict: Mapping from node to image name, used to translate
            the node path into image names.
        images_dir: Forwarded to ``image_pathway_plot``.
        image_type: Forwarded to ``image_pathway_plot``.
        path_size: Forwarded to ``defined_path`` only.
        best_path: Forwarded to ``defined_path`` only.
        best_type: Forwarded to ``defined_path`` and shown in the plot title.

    Returns:
        A ``(plot, node_path)`` tuple, where ``plot`` is whatever
        ``image_pathway_plot`` returns and ``node_path`` is the sequence of
        nodes found. If no path could be produced a message is printed and
        ``(None, [])`` is returned, so callers always get a 2-tuple with an
        iterable path.
    """
    try:
        if pathway_algo in (nx.dijkstra_path, nx.astar_path):
            # weight=None — presumably makes every edge cost the same, i.e. an
            # unweighted search; confirm against the networkx documentation.
            node_path = pathway_algo(G, node1, node2, weight=None)
        elif pathway_algo == defined_path:
            # NOTE(review): nx.to_numpy_matrix was removed in NetworkX 3.0.
            # nx.to_numpy_array is the replacement but returns an ndarray —
            # confirm defined_path accepts that before switching.
            G_weights = nx.to_numpy_matrix(G)
            node_path = pathway_algo(
                G, node1, node2, G_weights, path_size, best_path, best_type
            )
        else:
            print("Unsupported pathway algorithm: {}".format(pathway_algo))
            return None, []

        image_names_path = [image_names_dict[n] for n in node_path]

        title = "Pathway algo is {}.\nBest type is {}".format(pathway_algo, best_type)

        return (
            image_pathway_plot(images_dir, image_type, image_names_path, title),
            node_path,
        )
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        print("There is no pathway between nodes")
        return None, []
    except Exception as err:
        # Stay best-effort like before (the notebook keeps running), but report
        # the real cause instead of claiming that no pathway exists. Unlike a
        # bare ``except``, this lets KeyboardInterrupt through.
        print("Could not build pathway: {}: {}".format(type(err).__name__, err))
        return None, []

In [None]:
# Pre-bind everything except the graph (and the optional path settings), so
# each algorithm can be run with a single call. The two partials differ only in
# the path-finding algorithm they use.
run_defined_pathway_partial, run_dijk_pathway_partial = (
    partial(
        run_pathway,
        pathway_algo=algo,
        node1=node1,
        node2=node2,
        image_names_dict=image_names_dict,
        images_dir=images_dir,
        image_type=image_type,
    )
    for algo in (defined_path, nx.dijkstra_path)
)

In [None]:
# Run the three pathway variants on the same graph and node pair. Each call
# returns (plot, node_path); only the node path is kept.
(_, node_path_dijk) = run_dijk_pathway_partial(G)
(_, node_path_sum) = run_defined_pathway_partial(G, path_size=12, best_type="sum")
(_, node_path_var) = run_defined_pathway_partial(G, path_size=12, best_type="variance")

In [None]:
# Image names along the Dijkstra path, in path order.
[image_names_dict[n] for n in node_path_dijk]

In [None]:
# Draw the graph once per pathway. The value returned by the first call is
# passed back in as `pos` for the other two — presumably the node layout, so
# the three figures share the same positions (TODO confirm in plot_graph).
pos = plot_graph(G, figsize=(8, 8), node_list=node_path_dijk)
_ = plot_graph(G, figsize=(8, 8), node_list=node_path_sum, pos=pos)
_ = plot_graph(G, figsize=(8, 8), node_list=node_path_var, pos=pos)

In [None]:
# Coordinates used by the scatter plots below — presumably a low-dimensional
# reduction of the feature vectors (TODO confirm in reduce_data_nd).
x_data = reduce_data_nd(feature_vectors)

In [None]:
# Scatter plot of x_data drawn with the images, passing the image names of the
# Dijkstra path as `pathway`.
visualize_scatter_with_images(
    x_data,
    image_name_list=image_names,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(20, 20),
    image_zoom=0.1,
    pathway=[image_names_dict[n] for n in node_path_dijk],
)

In [None]:
# Same scatter plot, passing the image names of the defined path found with
# best_type="sum" as `pathway`.
visualize_scatter_with_images(
    x_data,
    image_name_list=image_names,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(20, 20),
    image_zoom=0.1,
    pathway=[image_names_dict[n] for n in node_path_sum],
)

In [None]:
# Same scatter plot, passing the image names of the defined path found with
# best_type="variance" as `pathway`.
visualize_scatter_with_images(
    x_data,
    image_name_list=image_names,
    images_dir=images_dir,
    image_type=image_type,
    figsize=(20, 20),
    image_zoom=0.1,
    pathway=[image_names_dict[n] for n in node_path_var],
)