In this notebook we:
- Load feature vectors
- Use fv_spaced_pathway_nD to find pathways in data reduced with umap.UMAP, trying different UMAP parameters
- Use fv_spaced_pathway_nD to find pathways in the original (unreduced) feature vectors

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
from tqdm import tqdm
import os
from io import BytesIO
import ast
import numpy as np
import pickle
from itertools import compress
from collections import Counter
import operator

from PIL import Image
import torch
import boto3
from scipy.spatial.distance import cdist
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from itertools import combinations
import umap.umap_ as umap

In [None]:
# Move up to the parent directory, where the `src` package imported below is
# expected to live. The guard keeps the cell idempotent: re-running it does not
# climb a further directory level once `src` is already visible.
if not os.path.isdir("src"):
    os.chdir("..")
os.getcwd()

In [None]:
from src.network_functions import (
    import_feature_vectors,
    image_pathway_plot,
    reduce_data_nd,
    fv_spaced_pathway_nD,
)

In [None]:
# Collect the base names (extension stripped) of every png file in the data folder
images_dir = "data/"
image_type = ".png"

# os.listdir returns entries in arbitrary order, so the names are sorted: the
# seeded random sample drawn from this list is then the same on every machine.
# endswith (rather than a substring test) avoids matching names like "x.png.bak".
image_names = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(images_dir)
    if file_name.endswith(image_type)
)
len(image_names)

### Create a sample of images to work with

In [None]:
n_sample = 1000
np.random.seed(0)  # Fixed seed so the same sample is drawn on every run (for dev)
# Sample without replacement. The size is capped at the number of available
# images, because asking for more than that with replace=False raises ValueError.
image_name_list = np.random.choice(
    image_names, min(n_sample, len(image_names)), replace=False
)

### 1. Get feature vectors as they are (>4000 dimensions)

In [None]:
# S3 location passed to import_feature_vectors
bucket_name = "miro-images-feature-vectors"
folder_name = "feature_vectors"
# X degrees of separation uses 15 here, which may be too many for this sample.
# TODO: consider making this a fraction of n_sample instead.
n = 3
dist_threshold = 0.35

s3 = boto3.client("s3")

In [None]:
# Load the feature vectors for the sampled images (second return value is unused)
feature_vectors, _ = import_feature_vectors(
    s3, bucket_name, folder_name, image_name_list
)

# Keep only the images for which a feature vector was found. Membership is
# tested on the dict itself (hash lookup) rather than on a list of its keys.
image_name_list = [name for name in image_name_list if name in feature_vectors]

In [None]:
# Number of feature vectors that were loaded
len(feature_vectors)

In [None]:
# Map each positional index to the image name at that position
image_names_dict = dict(enumerate(image_name_list))

### 2. Experimenting with the umap.UMAP parameters
- n_components
- min_dist
- n_neighbors

In [None]:
# The pathway arguments are identical for every parameter combination, so they
# are set once before the sweep.
node1 = 100
node2 = 30
n_nodes = 6

title_template = "{} components, {} min dist, {} neighbors"

# Sweep the UMAP parameter grid and draw one pathway plot per combination
for n_neighbors in (2, 50, 200):
    for min_dist in (0.1, 0.5, 0.9):
        for n_components in (2, 3):
            x_data = reduce_data_nd(
                feature_vectors, n_components, n_neighbors, min_dist
            )
            node_path = fv_spaced_pathway_nD(x_data, node1, node2, n_nodes)

            plot_title = title_template.format(n_components, min_dist, n_neighbors)
            image_pathway_plot(images_dir, image_type, node_path, title=plot_title)

### 3. Use all the feature vectors (don't reduce)

In [None]:
# Use the feature vectors without any reduction; each vector is converted to a
# plain list before being handed to the pathway function.
feature_vectors_trans = {k: list(v) for k, v in feature_vectors.items()}

node1 = 100
node2 = 30
n_nodes = 6

node_path = fv_spaced_pathway_nD(feature_vectors_trans, node1, node2, n_nodes)
# No UMAP reduction happens in this section, so the title must not reuse the
# n_components / min_dist / n_neighbors values left over from the parameter
# sweep: they would mislabel the plot, and would be undefined on a fresh kernel
# if the sweep cell was skipped.
image_pathway_plot(
    images_dir,
    image_type,
    node_path,
    title="All feature vectors (no dimensionality reduction)",
)