# Data Analysis using TSNE + Bokeh on toy image dataset

#### The code purpose is:
- Perform t-SNE and display, for each point, the source image, its label, and the path of the source image (the paths need to be adjusted to your data)
- Embed the images using CLIP
- Perform PCA before t-SNE if the chosen `pca_n_components` value is greater than 0
- Provide a clean starting point for a t-SNE pipeline: just edit the configuration cells below to change the dataset and parameters

In [1]:
import os
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from bokeh.plotting import figure, show, output_notebook, output_file, save
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10
from PIL import Image
import base64
import io

def run_tsne_bokeh_visualization(
    features: np.ndarray,
    images: list,
    labels: list,
    image_paths: list = None,
    image_size: tuple = None,
    pca_n_components: int = 30,
    tsne_params: dict = None,
    figure_width: int = 900,
    figure_height: int = 700,
    output_path: str = None
):
    """
    Perform (optional) PCA → t-SNE on a high-dimensional image dataset and render an
    interactive Bokeh scatter plot with hoverable thumbnails and labels. If `output_path`
    is provided, save as a standalone HTML file; otherwise display inline in the notebook.

    Parameters
    ----------
    features : np.ndarray
        An (N × D) array of feature vectors (e.g., CNN embeddings or raw pixel vectors).
    images : list of PIL.Image
        A list of length N containing PIL Image objects corresponding to each feature.
    labels : list or np.ndarray
        A list/array of length N containing the (string-convertible) label for each image.
    image_paths : list of str, optional
        A list of length N containing the file path (or URL) for each image. If None, the
        path line of each tooltip is left empty.
    image_size : tuple (width, height) or None, default=None
        If None, each thumbnail uses its original PIL dimensions. Otherwise, resize each
        thumbnail to (width, height) before encoding.
    pca_n_components : int, default=30
        Number of principal components to retain before feeding into t-SNE. The value is
        capped at min(N, D), the largest number PCA can produce. If < 1, PCA is skipped
        and `features` are used directly.
    tsne_params : dict, optional
        Keyword args for sklearn.manifold.TSNE. If None, defaults to:
            {
                "n_components": 2,
                "perplexity": 30,
                "learning_rate": 200,
                "max_iter": 1000,
                "random_state": 42,
                "init": "random",
            }
    figure_width : int, default=900
        Width of the Bokeh figure in pixels.
    figure_height : int, default=700
        Height of the Bokeh figure in pixels.
    output_path : str, default=None
        If provided, the function saves a standalone HTML file to this path (creating the
        parent directory when the path has one). Otherwise it renders inline via
        output_notebook() and show().

    Returns
    -------
    None
        Either displays an interactive Bokeh scatter plot in the notebook or writes an HTML file.

    Raises
    ------
    ValueError
        If `features` is not 2-D, or if `images`, `labels` or `image_paths` do not have
        the same length as `features`.
    """

    # 1. Validate inputs
    features = np.asarray(features)
    if features.ndim != 2:
        raise ValueError(f"`features` must be a 2-D (N x D) array; got shape {features.shape}.")
    n = features.shape[0]
    if len(images) != n or len(labels) != n:
        raise ValueError("`features`, `images`, and `labels` must all have the same length N.")
    if image_paths is not None and len(image_paths) != n:
        raise ValueError("`image_paths` must be None or a list of length N.")

    # 2. Default t-SNE parameters if none provided
    if tsne_params is None:
        tsne_params = {
            "n_components": 2,
            "perplexity": 30,
            "learning_rate": 200,
            "max_iter": 1000,  # uses max_iter to avoid deprecation
            "random_state": 42,
            "init": "random",
        }

    # 3. Run PCA if requested (pca_n_components >= 1), else skip
    if pca_n_components >= 1:
        # PCA cannot return more components than min(n_samples, n_features),
        # so cap the request by both dimensions (matters for small datasets).
        pca = PCA(n_components=min(pca_n_components, n, features.shape[1]))
        features_reduced = pca.fit_transform(features)
        print("PCA Done! Reduced shape:", features_reduced.shape)
    else:
        features_reduced = features
        print("PCA Skipped! Using original features shape:", features_reduced.shape)

    # 4. Run t-SNE
    tsne = TSNE(**tsne_params)
    embeddings = tsne.fit_transform(features_reduced)

    # 5. Encode thumbnails as Base64 PNG data URIs so the HTML is self-contained
    def pil_to_base64_with_size(img: Image.Image):
        if image_size is None:
            img_to_encode = img
            w, h = img.size
        else:
            img_to_encode = img.resize(image_size, Image.NEAREST)
            w, h = image_size
        buffer = io.BytesIO()
        img_to_encode.save(buffer, format="PNG")
        return "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode(), w, h

    thumbnails_b64, thumb_ws, thumb_hs = [], [], []
    for img in images:
        b64_str, w, h = pil_to_base64_with_size(img)
        thumbnails_b64.append(b64_str)
        thumb_ws.append(w)
        thumb_hs.append(h)

    # 6. Build ColumnDataSource
    data_dict = {
        "x": embeddings[:, 0],
        "y": embeddings[:, 1],
        "thumb": thumbnails_b64,
        "thumb_w": thumb_ws,
        "thumb_h": thumb_hs,
        "label": [str(lbl) for lbl in labels],
    }
    # The tooltip template always references @img_path, so the column must exist.
    if image_paths is not None:
        data_dict["img_path"] = image_paths
    else:
        data_dict["img_path"] = [""] * n

    source = ColumnDataSource(data=data_dict)

    # 7. Prepare the flexbox-based tooltip HTML
    tooltip_html = """
    <div style="display: flex; align-items: center;">
        <div style="flex: none; margin-right: 10px;">
            <img src="@thumb" width="@thumb_w" height="@thumb_h" alt="@thumb"/>
        </div>
        <div style="display: flex; flex-direction: column; justify-content: center;">
            <div style="font-size: 12px; font-weight: bold;">@label</div>
            <div style="font-size: 10px; color: #555; margin-top: 4px;">@img_path</div>
        </div>
    </div>
    """
    hover_tool = HoverTool(tooltips=tooltip_html)

    # 8. Color mapper for categorical labels
    # Local import: these palettes are only needed for the >10-label fallbacks.
    from bokeh.palettes import Category20, Turbo256

    unique_labels = sorted(set(data_dict["label"]))
    n_labels = len(unique_labels)
    if n_labels in Category10:
        # Category10 only defines palettes for 3..10 colors.
        palette = Category10[n_labels]
    elif n_labels < 3:
        palette = Category10[3][:n_labels]
    elif n_labels <= 20:
        palette = Category20[n_labels]
    else:
        # Sample Turbo256 evenly; colors repeat only beyond 256 distinct labels.
        palette = [Turbo256[i * 256 // n_labels] for i in range(n_labels)]
    color_mapper = factor_cmap(
        field_name="label",
        palette=palette,
        factors=unique_labels,
    )

    # 9. Create figure
    p = figure(
        title="t-SNE Embeddings",
        tools=[hover_tool, "pan", "wheel_zoom", "reset"],
        x_axis_label="TSNE-1",
        y_axis_label="TSNE-2",
        width=figure_width,
        height=figure_height,
    )
    p.scatter("x", "y", size=10, source=source, color=color_mapper, alpha=0.8)

    # 10. Render or save
    if output_path:
        # Ensure parent directory exists; a bare filename has no parent to
        # create, and os.makedirs("") would raise FileNotFoundError.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        output_file(output_path)
        save(p)
        print(f"✅ Bokeh figure saved to: {output_path}")
    else:
        output_notebook()
        show(p)


In [2]:
import numpy as np
from PIL import Image

def convert_array_to_pil_images(image_array: np.ndarray, rescale: bool = True):
    """
    Convert a NumPy array of shape (N, H, W) into a list of PIL.Image objects.

    Args:
        image_array (np.ndarray):
            The source array with shape (N, H, W). Any numeric dtype is accepted;
            values are processed as float64.
        rescale (bool):
            If True, linearly scale each image from its own [min, max] to [0, 255]
            (a constant image becomes all zeros).
            If False, values are used as-is and clipped to [0, 255].

    Returns:
        List[Image.Image]: A list of PIL Image objects (mode 'L' for grayscale).

    Raises:
        ValueError: If `image_array` is not 3-dimensional.
    """
    image_array = np.asarray(image_array)
    if image_array.ndim != 3:
        raise ValueError(
            f"`image_array` must have shape (N, H, W); got shape {image_array.shape}."
        )

    pil_images = []
    for single in image_array:  # each `single` has shape (H, W)
        # Work in float64 so the min/max arithmetic cannot overflow narrow
        # integer dtypes (e.g. int8 or uint8 inputs).
        single = single.astype(np.float64)

        if rescale:
            # Compute min/max per image to avoid saturating outliers
            min_val = single.min()
            max_val = single.max()
            # Avoid division by zero if the image is constant
            if max_val > min_val:
                scaled = (single - min_val) / (max_val - min_val) * 255.0
            else:
                # If max == min, the image is flat; produce a zeroed image
                scaled = np.zeros_like(single)
        else:
            scaled = single

        # Clip before casting: converting out-of-range floats to uint8 would
        # otherwise wrap around (or be undefined) instead of saturating.
        array_uint8 = np.clip(scaled, 0, 255).astype(np.uint8)
        # Create a PIL Image in 'L' mode (8-bit grayscale)
        img = Image.fromarray(array_uint8, mode='L')
        pil_images.append(img)

    return pil_images

# Example usage:

# Suppose `batch` is your array with shape (N, H, W), e.g. (6, 28, 28)
# batch = np.array([...], dtype=float)

# pil_list = convert_array_to_pil_images(batch, rescale=False)
# pil_list[:5]

In [3]:
# Dataset Configurations:
from sklearn.datasets import load_digits
digits = load_digits() # load toy image dataset

# More Dataset Configurations:
numpy_array_of_images = digits.images
# pil_list is a standard Python list of PIL.Image objects; it is the input to the embedding step.
# NOTE(review): the thumbnail scaling below treats digit intensities as lying in [0, 16];
# with rescale=False they are cast as-is, so these images are very dark.
# Consider rescale=True if the embedding model should see full-contrast images.
pil_list = convert_array_to_pil_images(numpy_array_of_images, rescale=False)
labels = digits.target.astype(str)
# Thumbnails for the hover tooltips: map [0, 16] onto [0, 255].
# (Multiplying by 16 would send the maximum intensity 16 to 256, which overflows uint8.)
images = [Image.fromarray((im * (255.0 / 16.0)).astype(np.uint8)) for im in digits.images]
# Adjust image_paths to get the real paths
image_paths = [f"/path/to/digits/digit_{i}.png" for i in range(len(images))]

In [4]:
# Create the features: embed every image in `pil_list` with CLIP.

import torch
import clip

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)
print(device, model.visual.conv1, preprocess, sep='\n-----------------------------\n')

if not pil_list:
    raise ValueError("`pil_list` is empty; there is nothing to embed.")

# Encode in mini-batches instead of one forward pass per image. The embedding
# width is taken from the model output rather than hard-coded, so other CLIP
# variants work too, and results are kept in float32 so nothing is truncated
# to half precision when the model runs in float32.
batch_size = 64
embedding_batches = []
with torch.no_grad():
    for start in range(0, len(pil_list), batch_size):
        batch_images = pil_list[start:start + batch_size]
        image_input = torch.stack([preprocess(img) for img in batch_images]).to(device)
        # Calculate features
        image_embeddings = model.encode_image(image_input)
        embedding_batches.append(image_embeddings.float().cpu())

# One row per image, in the same order as `pil_list`.
tensor_of_embeddings = torch.cat(embedding_batches, dim=0)
array_of_embeddings = tensor_of_embeddings.numpy()

print(f'-----------------------------\n\n array_of_embeddings shape is: {array_of_embeddings.shape}')

cuda
-----------------------------
Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
-----------------------------
Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x70655d0b49a0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)
-----------------------------

 array_of_embeddings shape is: (1797, 512)


In [5]:
# TSNE + Bokeh configurations

# Feature source: the CLIP embeddings computed above.
# (Assign `digits.data` here instead to use the raw pixel values as features.)
features = array_of_embeddings

# Keyword arguments forwarded to sklearn.manifold.TSNE
tsne_params = dict(
    n_components=2,
    perplexity=50,
    learning_rate=150,
    max_iter=2000,
    random_state=123,
    init="random",
)

# Optional: where the standalone HTML is written (current working directory)
output_fn = 'tsne_bokeh_toy_output.html'
output_path = os.path.join(os.getcwd(), output_fn)

# Run the core function; a pca_n_components below 1 skips the PCA step
run_tsne_bokeh_visualization(
    features=features,
    images=images,
    labels=labels,
    image_paths=image_paths,
    image_size=(64, 64),
    pca_n_components=-1,
    tsne_params=tsne_params,
    figure_width=800,
    figure_height=600,
    output_path=output_path,
)

PCA Skipped! Using original features shape: (1797, 512)
✅ Bokeh figure saved to: /home/tomer-v-u/PycharmProjects/t-sne-test/tsne_bokeh_toy_output.html


#### Requirements:

In [6]:
# List every installed package with its version to document the environment.
!pip list

Package                   Version
------------------------- --------------
anyio                     4.9.0
anywidget                 0.9.18
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.4
bleach                    6.2.0
blinker                   1.9.0
bokeh                     3.7.3
certifi                   2025.4.26
cffi                      1.17.1
charset-normalizer        3.4.2
click                     8.2.1
clip                      1.0
comm                      0.2.2
contourpy                 1.3.2
cycler                    0.12.1
dash                      3.0.4
debugpy                   1.8.14
decorator                 5.2.1
defusedxml                0.7.1
executing                 2.2.0
fastjsonschema            2.21.1
filelock                  3.13.1
filetype        

In [7]:
# Show the Python interpreter version used to run this notebook.
!python --version

Python 3.12.10


In [8]:
!pip list > requirements.txt