In [None]:
#@title # ProtSpace: Interactive Protein Embedding Visualization
#@markdown ### About ProtSpace ([github](https://github.com/tsenoner/ProtSpace))
#@markdown ProtSpace is a tool for interactive visualization of protein embeddings that:
#@markdown - Converts high-dimensional protein embeddings into 2D/3D visualizations
#@markdown - Supports multiple dimension reduction methods (PCA, UMAP, t-SNE, PaCMAP)
#@markdown - Allows annotation-based coloring and shaping of data points
#@markdown - Integrates protein structure visualization alongside embedding space
#@markdown - Enables publication-quality exports and sharing of visualization sessions

#@markdown ### Basic Workflow:
#@markdown 1. Upload protein embeddings (H5 file)
#@markdown 2. Upload feature annotations (CSV file)
#@markdown 3. Choose visualization methods
#@markdown 4. Explore your protein space interactively

In [None]:
#@title Install Dependencies and Import Libraries (~2min)
%%capture
!pip install protspace

import sys
import os
from google.colab import files
from pathlib import Path

from protspace.app import ProtSpace

In [None]:
#@title 📤 Upload Data Files
#@markdown Upload two files:
#@markdown - Embedding file (`.h5`, `.hdf5`, or `.hdf`)
#@markdown - Feature file (`.csv`)

# Ask for the files through google.colab's upload helper. The return value is
# kept in `uploaded`; the cells visible in this notebook do not read it again
# and instead locate the files through the path fields of the next cell.
uploaded = files.upload()

In [None]:
#@title 📂 File Paths
#@markdown ### Enter paths to your files

#@markdown #### Path to embedding file (.h5/.hdf5/.hdf):
embedding_file = "localization_embeddings.h5" #@param {type:"string"}

#@markdown #### Path to feature file (.csv):
feature_file = "localization.csv" #@param {type:"string"}


def check_input_file(path, label, extensions, format_hint):
    """Report whether `path` is a usable input file, printing a warning if not.

    Args:
        path: Path entered in the form (may be an empty string).
        label: Lower-case role of the file used in the messages
            (e.g. "embedding" or "feature").
        extensions: Tuple of accepted lower-case file suffixes.
        format_hint: Human-readable description of the accepted formats,
            inserted into the "must be ... format" message.

    Returns:
        True when the path is non-empty, exists on disk and ends with one of
        the accepted suffixes; False otherwise (after printing one warning).
    """
    if not path:
        print(f"⚠️ Please enter path to {label} file")
        return False
    if not os.path.exists(path):
        print(f"⚠️ {label.capitalize()} file not found: {path}")
        return False
    # Compare case-insensitively so names such as "DATA.H5" are accepted too.
    if not path.lower().endswith(extensions):
        print(f"⚠️ {label.capitalize()} file must be {format_hint} format")
        return False
    return True


# Validate files (both checks always run so every problem is reported at once)
embedding_ok = check_input_file(
    embedding_file, "embedding", (".h5", ".hdf5", ".hdf"), ".h5, .hdf5, or .hdf"
)
feature_ok = check_input_file(feature_file, "feature", (".csv",), ".csv")
valid_files = embedding_ok and feature_ok

# Always define output_file: later cells read it, and leaving it undefined
# after a failed validation would surface as a NameError there instead of a
# readable "file not found" message.
output_file = ""
if valid_files:
    print("✅ Files validated successfully")
    output_file = str(Path(embedding_file).with_suffix('.json'))

In [None]:
#@title 🔧 Configure Visualization
#@markdown ### Choose dimension reduction methods:
#@markdown - **PCA**: Fast, linear reduction
#@markdown - **UMAP**: Preserves global and local structure
#@markdown - **t-SNE**: Focuses on local structure
#@markdown - **PaCMAP**: Balances global and local structure

use_pca = True #@param {type:"boolean"}
use_umap = True #@param {type:"boolean"}
use_tsne = False #@param {type:"boolean"}
use_pacmap = False #@param {type:"boolean"}

#@markdown ### Choose dimensions:
dimensions = "2D only" #@param ["2D only", "3D only", "2D and 3D"]

# Identifiers of the form "<method><n_dims>" (e.g. "pca2") collected below.
methods = []

def add_method(use_method, method_name):
    """Append the identifiers for one reduction method to `methods`.

    Nothing is added when `use_method` is falsy. Otherwise the global
    `dimensions` choice decides which suffixes are appended to
    `method_name`: "2" for "2D only", "3" for "3D only", and both (2 first)
    for any other value.
    """
    if not use_method:
        return
    if dimensions == "2D only":
        suffixes = ("2",)
    elif dimensions == "3D only":
        suffixes = ("3",)
    else:
        suffixes = ("2", "3")
    methods.extend(f"{method_name}{suffix}" for suffix in suffixes)

# Registration order fixes the order of the identifiers in `methods_str`.
add_method(use_pca, "pca")
add_method(use_umap, "umap")
add_method(use_tsne, "tsne")
add_method(use_pacmap, "pacmap")

# Space-separated list for the command line; `params_str` starts empty so the
# optional advanced-parameter cell can be skipped.
methods_str = " ".join(methods)
params_str = ""

In [None]:
#@title ## Advanced Parameters (optional)

#@markdown #### UMAP Parameters:
# Each parameter group below is assigned only when its method flag is truthy,
# so e.g. `umap_n_neighbors` does not exist unless `use_umap` is set. The
# `use_*` flags are not defined in this cell and must already be in scope.
if use_umap:
    #@markdown - Number of neighbors influences locality preservation
    umap_n_neighbors = 50 #@param {type:"slider", min:2, max:200, step:1}
    #@markdown - Minimum distance between points
    umap_min_dist = 0.5 #@param {type:"slider", min:0.0, max:1.0, step:0.01}
    #@markdown - Distance metric
    umap_metric = "euclidean" #@param ["euclidean", "cosine"]

# @markdown ---
#@markdown #### t-SNE Parameters:
if use_tsne:
    #@markdown - Perplexity balances local and global structure
    tsne_perplexity = 30 #@param {type:"slider", min:5, max:100, step:5}
    #@markdown - Learning rate influences optimization
    tsne_learning_rate = 200 #@param {type:"number"}

# @markdown ---
#@markdown #### PaCMAP Parameters:
if use_pacmap:
    #@markdown - Number of neighbors
    pacmap_n_neighbors = 25 #@param {type:"slider", min:2, max:100, step:1}
    #@markdown - MN ratio (Mid-Near pairs ratio): Controls local structure preservation
    #@markdown   - Higher values (→1.0): Better preserves local structure
    #@markdown   - Lower values (→0.1): Allows more global structure influence
    pacmap_mn_ratio = 0.5 #@param {type:"slider", min:0.1, max:1.0, step:0.1}
    #@markdown - FP ratio (Further Pairs ratio): Controls global structure preservation
    #@markdown   - Higher values (→5.0): Better preserves global structure, more separation between clusters
    #@markdown   - Lower values (→0.1): Focuses more on local relationships
    pacmap_fp_ratio = 2.0 #@param {type:"slider", min:0.1, max:5.0, step:0.1}
    #@markdown
    #@markdown Recommended combinations:
    #@markdown - Balanced view: MN=0.5, FP=2.0
    #@markdown - Local focus: MN=0.8, FP=1.0
    #@markdown - Global focus: MN=0.3, FP=3.0

# Build parameter string
# Each enabled method contributes its command-line flags, in the fixed order
# UMAP, t-SNE, PaCMAP; the result is joined with spaces into `params_str`.
params = []

if use_umap:
    params.extend([
        f"--n_neighbors {umap_n_neighbors}",
        f"--min_dist {umap_min_dist}",
        f"--metric {umap_metric}"
    ])

if use_tsne:
    params.extend([
        f"--perplexity {tsne_perplexity}",
        f"--learning_rate {tsne_learning_rate}"
    ])

if use_pacmap:
    params.extend([
        f"--n_neighbors {pacmap_n_neighbors}",
        f"--mn_ratio {pacmap_mn_ratio}",
        f"--fp_ratio {pacmap_fp_ratio}"
    ])

# UMAP and PaCMAP both emit `--n_neighbors`, so with both enabled the flag
# appears twice in the command line and the two sliders cannot both apply.
# The flags are left unchanged; the user is only told about the clash.
# NOTE(review): presumably the CLI keeps the last occurrence (PaCMAP's value)
# — confirm against the protspace-json argument parser.
if use_umap and use_pacmap and umap_n_neighbors != pacmap_n_neighbors:
    print(
        "⚠️ UMAP and PaCMAP both set --n_neighbors "
        f"({umap_n_neighbors} vs {pacmap_n_neighbors}); the flag is passed twice, "
        "so only one of the two values can take effect"
    )

params_str = " ".join(params)

In [None]:
#@title 📊 Generate JSON File
#@markdown Generate the visualization data file

if not valid_files:
    print("⚠️ Please fix file path issues before continuing")
else:
    print(f"Generating visualization data...")
    !protspace-json -i {embedding_file} -m {feature_file} -o {output_file} --methods {methods_str} {params_str}
    print(f"✅ JSON file saved as: {output_file}")

In [None]:
#@title 🚀 Launch ProtSpace
#@markdown ### Launch the visualization interface
#@markdown Optionally specify a different JSON file, or leave empty to use the one generated above

#@markdown #### Path to JSON file (optional):
json_file = "" #@param {type:"string"}

# Use generated JSON file if no other file specified. `output_file` may be
# missing (validation cell not run or failed), so look it up defensively
# instead of raising a NameError.
if not json_file:
    json_file = globals().get("output_file") or ""

# Validate JSON file
if not json_file:
    print("⚠️ No JSON file available: generate one above or enter a path")
elif not os.path.exists(json_file):
    print(f"⚠️ JSON file not found: {json_file}")
else:

    # Store the original stdout/stderr before suppressing
    original_stdout = sys.stdout
    original_stderr = sys.stderr

    def suppress_output():
        """Point sys.stdout and sys.stderr at the null device."""
        # NOTE(review): the handles are deliberately not closed on restore —
        # anything that captured these streams while suppressed (for example
        # a logging handler) would otherwise write to a closed file; confirm.
        sys.stdout = open(os.devnull, 'w')
        sys.stderr = open(os.devnull, 'w')

    def restore_output():
        """Put the streams saved before suppression back in place."""
        sys.stdout = original_stdout
        sys.stderr = original_stderr

    suppress_output()
    try:
        app = ProtSpace(default_json_file=json_file)
        app.run_server()
    finally:
        # Restore even if the server raises or the cell is interrupted;
        # otherwise every later cell would print into the null device.
        restore_output()

## Exercise
1. Upload the localization embeddings from before
    - The feature CSV file can be found here: https://nextcloud.in.tum.de/index.php/s/Dnf2L3e3d58gaDa
1. Download pre-computed ProtSpace configuration files (see next code block)
1. Upload and look at different configuration files
1. Interact with the plot
    - click in the legend (single- and double-click)
    - zoom in by selecting an area
1. Explore the 3FTx dataset (three-finger toxin dataset) -> enter **protspace/3FTx.json** in the `json_file` field
    - look at the `membrane_prediction` feature
    - compare it to the `major_group` feature
    - look at the more fine-grained `group` feature
    - and finally at the `number_cystein` feature
1. Explore the GFP (green fluorescent protein) dataset
1. Explore other files or generate your own file and explore it

In [None]:
# @title Get ProtSpace configuration files
!wget -O protspace.zip -r --no-parent -nH --cut-dirs=3 --reject "index.html*" -e robots=off https://nextcloud.in.tum.de/index.php/s/jb9fN3wawgTRswS/download
!unzip -j protspace.zip -d protspace