# ProtSpace: Interactive Protein Embedding Visualization

Interactive visualization of high-dimensional protein embeddings in 2D/3D space. Supports multiple dimensionality reduction methods (PCA, UMAP, t-SNE, MDS, PaCMAP) with annotation-based coloring and integrated structure viewing.

📚 [GitHub](https://github.com/tsenoner/protspace) • [Manuscript](https://www.sciencedirect.com/science/article/pii/S0022283625000063?via%3Dihub)


In [None]:
# @title Install Dependencies and Import Libraries (~1min)
%%capture
!pip install -q protspace

import gzip
import io
import os
import re
import subprocess
from pathlib import Path

import h5py
import ipywidgets as widgets
from google.colab import drive, files
from IPython.display import clear_output, display
from ipywidgets import (
    HTML,
    Button,
    Checkbox,
    FileUpload,
    FloatSlider,
    HBox,
    IntProgress,
    IntSlider,
    Output,
    SelectMultiple,
    Tab,
    VBox,
)

# 📊 Data Collection & Embedding Upload

This section guides you through obtaining protein embeddings and uploading them to the notebook for visualization with ProtSpace.

## Step 1: Obtain Protein Embeddings

You can get protein embeddings in two ways:

1.  **From UniProt:**
    - Go to the [UniProt website](https://www.uniprot.org/).
    - Use the UniProt search syntax to find proteins (e.g., `(ft_domain:phosphatase) AND (reviewed:true)`).
    - Click **"Customize"** → Select **"Embeddings"** to generate ProtT5 embeddings.
    - Download the results from your [Jobs Dashboard](https://www.uniprot.org/tool-dashboard).

2.  **From your own FASTA file:**
    - Generate embeddings using the dedicated Google Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tsenoner/protspace/blob/main/notebooks/ClickThrough_GenerateEmbeddings.ipynb)

## Step 2: Upload Your Embedding Data

Use the interactive widget in the code cell below to upload your embedding file (.h5, .hdf5, or .gz). Choose your preferred upload method and follow the on-screen instructions.


In [None]:
# @title üìÇ Upload Embedding File {display-mode: "form"}
# @markdown Choose your preferred upload method and follow the instructions.


def process_and_validate(filename, file_content, output_widget):
    """Write *file_content* to disk, validate it as HDF5, and return the output path.

    The suffix of *filename* selects the handling:

    * ``.gz``   - the content is gunzipped; the output keeps an inner
      ``.h5``/``.hdf5`` suffix if present (``x.h5.gz`` -> ``x.h5``), otherwise
      ``.h5`` is appended to the stem (``x.gz`` -> ``x.h5``).
    * ``.h5`` / ``.hdf5`` - the content is written unchanged under *filename*.
    * no suffix - the content is written as ``<name>.h5``.
    * anything else is rejected.

    The written file is then opened with h5py and its first top-level entry
    must be a non-empty 1-D dataset.

    Args:
        filename: Name of the uploaded file (used for suffix detection and to
            derive the output name).
        file_content: Bytes-like payload of the upload.
        output_widget: Context manager (e.g. an ipywidgets ``Output``) inside
            which status messages are printed.

    Returns:
        The path of the validated file as a string, or ``None`` when the format
        is unsupported or processing/validation fails. On failure any file
        written by this call is removed again.
    """
    filepath = Path(filename)
    hdf5_suffixes = (".h5", ".hdf5")
    output_name = None

    try:
        if filepath.suffix == ".gz":
            inner_name = filepath.stem
            # "x.h5.gz" already ends in an HDF5 suffix once ".gz" is stripped;
            # appending another ".h5" would produce "x.h5.h5".
            if Path(inner_name).suffix in hdf5_suffixes:
                output_name = inner_name
            else:
                output_name = inner_name + ".h5"
            with gzip.open(io.BytesIO(file_content), "rb") as gz_f:
                decompressed_data = gz_f.read()
            with open(output_name, "wb") as f:
                f.write(decompressed_data)
        elif filepath.suffix in hdf5_suffixes:
            output_name = filename
            with open(output_name, "wb") as f:
                f.write(file_content)
        elif filepath.suffix == "":
            output_name = filepath.name + ".h5"
            with open(output_name, "wb") as f:
                f.write(file_content)
        else:
            with output_widget:
                print(
                    f"‚ùå Unsupported format '{filepath.suffix}'. Use .h5, .hdf5, or .gz."
                )
            return None

        # Validate HDF5: inspect only the first entry's metadata, no data read.
        with h5py.File(output_name, "r") as f:
            keys = list(f.keys())
            if not keys:
                raise ValueError("HDF5 file contains no entries")
            first_entry = f[keys[0]]
            if not isinstance(first_entry, h5py.Dataset):
                raise ValueError(
                    f"Expected a dataset at '{keys[0]}', "
                    f"found {type(first_entry).__name__}"
                )
            # A dataset without a dataspace reports shape None; treat as 0-D.
            shape = first_entry.shape or ()
            if len(shape) != 1 or shape[0] == 0:
                raise ValueError(f"Expected 1-D embedding, got shape {shape}")
        with output_widget:
            print(f"‚úÖ Embedding file ready: {output_name}  (dim={shape[0]})")
        return output_name

    except Exception as e:
        # Top-level boundary for a UI callback: report instead of raising, and
        # do not leave a half-written or invalid file behind.
        with output_widget:
            print(f"‚ùå Failed to process '{filename}': {e}")
        if output_name and os.path.exists(output_name):
            os.remove(output_name)
        return None


# ---------------------------------------------------------------------------
# Upload methods
# ---------------------------------------------------------------------------

# Path of the validated embedding file, or None until an upload succeeds.
# Assigned by the upload callbacks in this cell; the generation cell reads it
# through globals().
embedding_file = None  # global result used by later cells


def method_widget_upload(output_widget):
    """Small files (<10 MB) via ipywidgets FileUpload.

    Displays a single-file upload button inside *output_widget*. When a file
    arrives it is passed to ``process_and_validate`` and the result (path or
    ``None``) is stored in the module-level ``embedding_file``.

    Args:
        output_widget: ipywidgets ``Output`` used for the button and for all
            status messages.
    """

    def on_file_upload(change):
        """Handle a change of the upload widget's ``value`` trait."""
        global embedding_file
        uploaded = change["new"]
        if not uploaded:
            return
        if isinstance(uploaded, dict):
            # ipywidgets 7.x: {filename: {"metadata": ..., "content": bytes}}
            filename, entry = next(iter(uploaded.items()))
        else:
            # ipywidgets 8.x: sequence of {"name": ..., "content": memoryview, ...}
            entry = uploaded[0]
            filename = entry["name"]
        # Normalise to bytes so downstream code sees one payload type.
        data = bytes(entry["content"])
        with output_widget:
            print(f"Processing: {filename}")
        embedding_file = process_and_validate(filename, data, output_widget)

    upload_widget = widgets.FileUpload(
        accept=".h5,.hdf5,.gz",
        multiple=False,
        description="Choose File",
        button_style="primary",
    )
    upload_widget.observe(on_file_upload, names="value")

    with output_widget:
        print("üìÅ Widget Upload (for small files < 10 MB)")
        print("‚ö†Ô∏è May fail silently for files > 10 MB due to browser limits.")
        display(upload_widget)


def method_google_drive(output_widget):
    """Large files from Google Drive.

    Mounts Drive at ``/content/drive``, lists every ``.h5``/``.hdf5``/``.gz``
    file below ``MyDrive`` (newest first, with its size), and shows a dropdown
    plus a button. Clicking the button reads the selected file and passes it to
    ``process_and_validate``; the result is stored in the module-level
    ``embedding_file``.

    Args:
        output_widget: ipywidgets ``Output`` used for the selection UI and
            status messages.
    """
    drive_path = "/content/drive/MyDrive"

    with output_widget:
        print("üìÅ Google Drive Upload")
        print("=" * 60)
        print("Mounting Google Drive...")
        try:
            drive.mount("/content/drive", force_remount=False)
            print("‚úÖ Google Drive mounted.")
        except Exception as e:
            # Report the real reason instead of assuming "already mounted";
            # stay best-effort and continue if the folder is usable anyway.
            print(f"‚ö†Ô∏è Could not mount Google Drive: {e}")

        if not os.path.isdir(drive_path):
            print(f"‚ùå Google Drive is not available at {drive_path}.")
            return

        # (mtime, size_in_bytes, path relative to drive_path)
        candidates = []
        try:
            for root, _dirs, files_list in os.walk(drive_path):
                for f in files_list:
                    if Path(f).suffix not in (".h5", ".hdf5", ".gz"):
                        continue
                    full_path = os.path.join(root, f)
                    try:
                        info = os.stat(full_path)
                    except OSError:
                        # Entry vanished or is unreadable: skip it rather
                        # than abort the whole listing.
                        continue
                    candidates.append(
                        (
                            info.st_mtime,
                            info.st_size,
                            os.path.relpath(full_path, drive_path),
                        )
                    )
        except Exception as e:
            print(f"‚ö†Ô∏è Could not list Drive files: {e}")

        if not candidates:
            print("\n‚ö†Ô∏è No embedding files (.h5, .hdf5, .gz) found in your Drive.")
            return

        # Most recently modified files first.
        candidates.sort(key=lambda item: item[0], reverse=True)
        file_options = [
            (f"{rel_path} ({size / (1024 * 1024):.1f} MB)", rel_path)
            for _mtime, size, rel_path in candidates
        ]

        file_dropdown = widgets.Dropdown(
            options=file_options,
            description="Select File:",
            style={"description_width": "initial"},
            layout=widgets.Layout(width="80%"),
        )
        status_output = widgets.Output()

        def on_process_click(_b):
            """Read the selected Drive file and validate it."""
            global embedding_file
            with status_output:
                clear_output()
            filename = file_dropdown.value
            if not filename:
                with status_output:
                    print("‚ùå No file selected.")
                return
            # os.path.join leaves an absolute *filename* untouched.
            file_path = os.path.join(drive_path, filename)
            try:
                with open(file_path, "rb") as f:
                    data = f.read()
            except OSError as e:
                with status_output:
                    print(f"‚ùå Could not read '{file_path}': {e}")
                embedding_file = None
                return
            embedding_file = process_and_validate(
                os.path.basename(filename), data, status_output
            )

        process_button = widgets.Button(
            description="Process Selected File",
            button_style="success",
            icon="check",
        )
        process_button.on_click(on_process_click)

        print("\nüìù Select your embedding file:")
        display(
            widgets.VBox(
                [
                    widgets.HBox([file_dropdown, process_button]),
                    status_output,
                ]
            )
        )


def method_colab_upload(output_widget):
    """Upload one file through Colab's native file dialog and validate it.

    Exactly one file must be chosen. On success the path returned by
    ``process_and_validate`` is stored in the module-level ``embedding_file``;
    otherwise ``embedding_file`` is reset to ``None``.

    Args:
        output_widget: ipywidgets ``Output`` that receives the status messages.
    """
    global embedding_file

    with output_widget:
        print("üìÅ Colab Native Upload")
        print("‚è≥ A file dialog will open ‚Äî select your embedding file.")

    received = files.upload()

    if len(received) == 1:
        ((picked_name, payload),) = received.items()
        with output_widget:
            print(f"Processing: {picked_name}")
        embedding_file = process_and_validate(picked_name, payload, output_widget)
        return

    # Zero or several files: nothing usable was selected.
    with output_widget:
        print("‚ùå Please upload exactly one embedding file.")
    embedding_file = None


# ---------------------------------------------------------------------------
# Main interface
# ---------------------------------------------------------------------------

# Dispatch table: dropdown value -> upload routine taking the output widget.
_METHODS = {
    1: method_widget_upload,
    2: method_google_drive,
    3: method_colab_upload,
}

# Area that every upload routine prints into / displays its widgets in.
output_area = widgets.Output()

start_button = widgets.Button(
    icon="upload",
    button_style="success",
    description="Start Upload",
)

method_selector = widgets.Dropdown(
    description="Method:",
    value=1,
    style={"description_width": "initial"},
    options=[
        ("üéØ Widget Upload (Quick, <10 MB only)", 1),
        ("üöÄ Google Drive (Fastest - Recommended)", 2),
        ("üì§ Colab Native Upload (Reliable)", 3),
    ],
)


def on_start_click(_b):
    """Clear the output area, then run the upload routine chosen in the dropdown."""
    with output_area:
        clear_output()
    chosen_method = _METHODS[method_selector.value]
    chosen_method(output_area)


start_button.on_click(on_start_click)

# Banner and short description of the three upload methods.
_rule = "=" * 60
for _banner_line in (
    _rule,
    "üìÇ EMBEDDING FILE UPLOAD",
    _rule,
    "\nüìå Choose your upload method:",
    "   ‚Ä¢ Widget: Quick and convenient for small files (under 10 MB).",
    "   ‚Ä¢ Google Drive: Fastest and recommended for large files.",
    "   ‚Ä¢ Colab Native: Reliable for various file sizes.\n",
):
    print(_banner_line)

# Controls first, then the area the selected method writes into.
display(widgets.VBox([method_selector, start_button]))
display(output_area)

In [None]:
# @title üöÄ Generate ProtSpace Parquet Bundle {display-mode: "form"}
# @markdown Configure and run `protspace-local` to generate visualization files

# Annotation categories matching the CLI
# Maps a category name (shown as a tab title) to the annotation names offered
# in that tab's multi-select list.
ANNOTATIONS = {
    "UniProt": [
        "annotation_score",
        "cc_subcellular_location",
        "ec",
        "fragment",
        "gene_name",
        "go_bp",
        "go_cc",
        "go_mf",
        "keyword",
        "length_fixed",
        "length_quantile",
        "protein_existence",
        "protein_families",
        "reviewed",
        "xref_pdb",
    ],
    "InterPro": [
        "cath",
        "cdd",
        "panther",
        "pfam",
        "prints",
        "prosite",
        "signal_peptide",
        "smart",
        "superfamily",
    ],
    "Taxonomy": [
        "root",
        "domain",
        "kingdom",
        "phylum",
        "class",
        "order",
        "family",
        "genus",
        "species",
    ],
}

# Display names of the selectable dimensionality reduction methods.
METHODS = ["PCA", "UMAP", "t-SNE", "MDS", "PaCMAP"]

# Method parameters: (min, max, default)
# One slider is built per entry; an int default yields an IntSlider, anything
# else a FloatSlider. The parameter name is also used as the CLI flag
# ("--<name>"). PCA has no entry and therefore no sliders.
METHOD_PARAMS = {
    "UMAP": {"n_neighbors": (5, 200, 30), "min_dist": (0.0, 1.0, 0.5)},
    "t-SNE": {"perplexity": (5, 50, 30), "learning_rate": (10, 1000, 200)},
    "PaCMAP": {
        "n_neighbors": (5, 200, 30),
        "mn_ratio": (0.1, 1.0, 0.5),
        "fp_ratio": (1.0, 5.0, 2.0),
    },
    "MDS": {"n_init": (1, 10, 4), "max_iter": (50, 1000, 300)},
}

# Progress bar task names keyed by substring found in CLI output
# Value: (internal key for the progress widget, label shown above the bar).
_PROGRESS_TASKS = {
    "UniProt annotations": ("uniprot", "üîç Fetching UniProt Annotations"),
    "taxonomy annotations": ("taxonomy", "üåø Fetching Taxonomy Annotations"),
    "InterPro annotations": ("interpro", "üß¨ Fetching InterPro Annotations"),
}

# Lines from the subprocess we don't want to show
# Matched as plain substrings, so e.g. "Done" drops every line containing it.
_SKIP_PATTERNS = [
    "Unable to register cu",
    "WARNING: All log messages",
    "E0000",
    "W0000",
    "This TensorFlow binary",
    "Creating directory",
    "Welcome to Bioservices",
    "It looks like you do not have",
    "We are creating one with default",
    "Done",
    "To enable the following instructions",
]


class ProtSpaceConfigWidget:
    """Interactive form that configures and runs ``protspace-local``.

    The form lets the user pick annotations, an optional CSV/TSV metadata
    file, the reduction methods and their parameters. Pressing the run button
    builds the command line, executes it as a subprocess and mirrors its
    "Fetching ..." tqdm output into Jupyter progress bars. On success the
    output path is published as the global ``protspace_output_file``.

    Attributes:
        widget: Top-level ``VBox`` to pass to ``display``.
        output: ``Output`` widget receiving all run-time messages.
    """

    # Number of trailing subprocess output lines shown when the run fails.
    _LOG_TAIL_LINES = 40

    def __init__(self):
        self._build_widgets()
        self._build_layout()

    # ------------------------------------------------------------------
    # Widget construction
    # ------------------------------------------------------------------

    @staticmethod
    def _float_step(lo, hi):
        """Return the FloatSlider step for the range ``[lo, hi]``.

        Small ranges (e.g. 1.0-5.0) need a fine step; a step of 10 is kept
        only for ranges wider than 10, where a coarse step is still usable.
        """
        return 0.1 if hi <= 1.0 or (hi - lo) <= 10 else 10

    def _build_widgets(self):
        """Create all input widgets and store them on ``self``."""
        # Annotation tabs
        annotation_widgets = {}
        defaults = {
            "UniProt": [
                "cc_subcellular_location",
                "reviewed",
                "fragment",
                "length_fixed",
                "protein_families",
            ],
            "InterPro": ["pfam", "cath"],
            "Taxonomy": [],
        }
        for category, names in ANNOTATIONS.items():
            annotation_widgets[category] = SelectMultiple(
                options=names,
                value=defaults.get(category, []),
                description="",
                style={"description_width": "initial"},
                layout={"height": "120px", "width": "300px"},
            )
        self.annotation_tabs = Tab(children=list(annotation_widgets.values()))
        for i, cat in enumerate(ANNOTATIONS):
            self.annotation_tabs.set_title(i, cat)

        # CSV metadata upload
        self.csv_upload = FileUpload(
            accept=".csv,.tsv", multiple=False, description="Choose CSV"
        )

        # Method selection
        self.methods = SelectMultiple(
            options=METHODS,
            value=["PCA", "UMAP"],
            description="",
            style={"description_width": "initial"},
            layout={"height": "120px", "width": "200px"},
        )

        # Method parameter sliders: one tab per method, one slider per parameter.
        self.param_widgets = {}
        param_tab_children, param_tab_titles = [], []
        for method, params in METHOD_PARAMS.items():
            method_widgets = {}
            for pname, (lo, hi, default) in params.items():
                if isinstance(default, int):
                    slider_cls, kw = IntSlider, {}
                else:
                    slider_cls, kw = FloatSlider, {"step": self._float_step(lo, hi)}
                method_widgets[pname] = slider_cls(
                    value=default,
                    min=lo,
                    max=hi,
                    description=pname.replace("_", " ").title() + ":",
                    style={"description_width": "120px"},
                    layout={"width": "300px"},
                    **kw,
                )
            self.param_widgets[method] = method_widgets
            param_tab_children.append(VBox(list(method_widgets.values())))
            param_tab_titles.append(method)
        self.param_tabs = Tab(children=param_tab_children)
        for i, t in enumerate(param_tab_titles):
            self.param_tabs.set_title(i, t)

        # Options
        self.keep_temp = Checkbox(value=False, description="Keep temporary files")

        # Run button + output
        self.run_button = Button(
            description="üöÄ Generate Bundle",
            button_style="primary",
            layout={"width": "200px"},
        )
        self.run_button.on_click(self._on_run)
        self.output = Output()

    def _build_layout(self):
        """Assemble the widgets into the top-level ``self.widget`` container."""
        self.widget = VBox(
            [
                VBox(
                    [
                        HTML("<h3>üìã Select Annotations</h3>"),
                        HTML(
                            "<p><i>Hold Ctrl/Cmd to select multiple. Taxonomy downloads a database on first use (~1 min).</i></p>"
                        ),
                        self.annotation_tabs,
                    ]
                ),
                VBox(
                    [
                        HTML("<h3>üìÑ Custom CSV Metadata (optional)</h3>"),
                        HTML(
                            "<p><i>Upload a CSV/TSV with per-protein annotations. First column = protein identifiers.</i></p>"
                        ),
                        self.csv_upload,
                    ]
                ),
                VBox(
                    [
                        HTML("<h3>üìä Select 2D Reduction Methods</h3>"),
                        HTML("<p><i>Hold Ctrl/Cmd to select multiple.</i></p>"),
                        self.methods,
                    ]
                ),
                VBox(
                    [
                        HTML("<h3>‚öôÔ∏è Method Parameters</h3>"),
                        HTML("<p><i>PCA has no adjustable parameters.</i></p>"),
                        self.param_tabs,
                    ]
                ),
                VBox(
                    [
                        HTML("<h3>üîß General Options</h3>"),
                        self.keep_temp,
                    ]
                ),
                HBox([self.run_button]),
                self.output,
            ]
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _save_uploaded_csv(self):
        """Save the uploaded CSV/TSV to disk and return its path, or None.

        Accepts both shapes of ``FileUpload.value``: the ipywidgets 7.x dict
        ``{filename: {"content": ...}}`` and the ipywidgets 8.x sequence of
        ``{"name": ..., "content": ...}`` entries.

        Returns:
            The path (str) of the file written to the working directory, or
            ``None`` when nothing was uploaded or the upload is empty.
        """
        uploaded = self.csv_upload.value
        if not uploaded:
            return None
        if isinstance(uploaded, dict):
            name, entry = next(iter(uploaded.items()))
        else:
            entry = list(uploaded)[0]
            name = entry.get("name")
        content = entry.get("content", b"")
        if not content:
            return None
        # Keep only the final path component so the file lands in the cwd.
        csv_path = Path(Path(name or "custom_annotations.csv").name)
        csv_path.write_bytes(bytes(content))
        return str(csv_path)

    def _selected_annotations(self):
        """Return the annotation names selected across all tabs, in tab order."""
        return [name for tab in self.annotation_tabs.children for name in tab.value]

    def _method_commands(self):
        """Return the CLI method identifiers, e.g. ``"t-SNE"`` -> ``"tsne2"``."""
        return [m.lower().replace("-", "") + "2" for m in self.methods.value]

    def _method_param_flags(self):
        """Return ``["--name", "value", ...]`` for the selected methods.

        Some parameter names are shared between methods (``n_neighbors`` for
        UMAP and PaCMAP). Each flag is emitted once; when the sliders disagree
        the later-selected method's value is used and a warning is printed.
        """
        # NOTE(review): assumes protspace-local keeps the last occurrence of a
        # repeated flag, so "last wins" matches passing it twice - confirm.
        values, origin = {}, {}
        for method in self.methods.value:
            for pname, w in self.param_widgets.get(method, {}).items():
                if pname in values and values[pname] != w.value:
                    print(
                        f"‚ö†Ô∏è --{pname} is shared: using {method} value {w.value} "
                        f"instead of {origin[pname]} value {values[pname]}."
                    )
                values[pname] = w.value
                origin[pname] = method
        flags = []
        for pname, value in values.items():
            flags.extend([f"--{pname}", str(value)])
        return flags

    # ------------------------------------------------------------------
    # Build & run CLI command
    # ------------------------------------------------------------------

    def _on_run(self, _button):
        """Validate the inputs, build the ``protspace-local`` command and run it."""
        with self.output:
            clear_output()

            # Validate embedding file
            input_file = globals().get("embedding_file")
            if not input_file or not os.path.exists(input_file):
                print(
                    "‚ùå No valid embedding file found. Upload one in the section above first."
                )
                return

            # Validate before touching the disk (the CSV is written below).
            method_cmds = self._method_commands()
            if not method_cmds:
                print("‚ùå Select at least one dimensionality reduction method.")
                return

            input_path = Path(input_file)
            output_path = input_path.parent / (input_path.stem + ".parquetbundle")
            output_path.parent.mkdir(parents=True, exist_ok=True)

            annotations = self._selected_annotations()
            csv_path = self._save_uploaded_csv()

            # Build command
            cmd = [
                "protspace-local",
                "-i",
                input_file,
                "-m",
                ",".join(method_cmds),
                "-o",
                str(output_path),
            ]
            # NOTE(review): "-a" may be passed twice (CSV path and annotation
            # names); presumably the CLI accepts repeated "-a" - confirm.
            if csv_path:
                cmd.extend(["-a", csv_path])
            if annotations:
                cmd.extend(["-a", ",".join(annotations)])
            cmd.extend(self._method_param_flags())
            if self.keep_temp.value:
                cmd.append("--keep-tmp")

            # Print summary
            print("--- ProtSpace Configuration ---")
            print(f"Input:   {input_file}")
            print(f"Output:  {output_path}")
            print(f"Methods: {', '.join(self.methods.value)}")
            if csv_path:
                print(f"CSV:     {csv_path}")
            print(f"Annotations: {', '.join(annotations) if annotations else 'None'}")
            print(f"Command: {' '.join(cmd)}\n")

            self._execute(cmd, output_path)

    def _update_progress(self, line, progress_widgets):
        """Mirror one tqdm "Fetching ..." line into its Jupyter progress bar.

        Args:
            line: Stripped output line containing a tqdm bar.
            progress_widgets: Dict of already created bars, keyed by task key;
                new bars are created, displayed and stored on first use.
        """
        for substr, (key, label) in _PROGRESS_TASKS.items():
            if substr not in line:
                continue
            entry = progress_widgets.get(key)
            if entry is None:
                bar = IntProgress(
                    value=0,
                    min=0,
                    max=100,
                    bar_style="info",
                    style={"bar_color": "#17a2b8"},
                    layout={"width": "400px"},
                )
                lbl = HTML(value=f"<b>{label}</b>")
                entry = progress_widgets[key] = {"bar": bar, "label": lbl}
                display(VBox([lbl, bar]))
            match = re.search(r"(\d+)%", line)
            if match:
                pct = int(match.group(1))
                entry["bar"].value = pct
                if pct == 100:
                    entry["bar"].bar_style = "success"
                    entry["label"].value = f"<b>{label} ‚úÖ</b>"

    def _execute(self, cmd, output_path):
        """Run *cmd* and render progress bars from tqdm output.

        Lines matching ``_SKIP_PATTERNS`` and tqdm bar lines are hidden. The
        remaining output is kept (last ``_LOG_TAIL_LINES`` lines) and printed
        only if the process exits with a non-zero code, so failures can be
        diagnosed.

        Args:
            cmd: Argument list passed to ``subprocess.Popen``.
            output_path: Expected output file; published as the global
                ``protspace_output_file`` when the process succeeds.
        """
        progress_widgets = {}
        log_tail = []
        status = HTML(value="<b>üöÄ Starting ProtSpace processing...</b>")
        display(status)

        try:
            # The context manager closes the pipe and reaps the process even
            # if reading its output raises.
            with subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
            ) as proc:
                for line in proc.stdout:
                    line = line.strip()
                    if not line or any(p in line for p in _SKIP_PATTERNS):
                        continue

                    # Update Jupyter progress bars from tqdm lines
                    if "%|" in line:
                        if "Fetching" in line:
                            self._update_progress(line, progress_widgets)
                        continue

                    log_tail.append(line)
                    del log_tail[: -self._LOG_TAIL_LINES]

            if proc.returncode == 0:
                status.value = "<b>‚úÖ ProtSpace bundle generated successfully!</b>"
                print(f"\nOutput: {output_path}")
                globals()["protspace_output_file"] = str(output_path)
                print("Ready for download ‚Äî proceed to the next cell.")
            else:
                status.value = (
                    f"<b>‚ùå Processing failed (exit code {proc.returncode})</b>"
                )
                if log_tail:
                    print("\n--- Last output from protspace-local ---")
                    print("\n".join(log_tail))

        except FileNotFoundError:
            print("‚ùå protspace-local not found. Make sure ProtSpace is installed.")
        except Exception as e:
            print(f"‚ùå Unexpected error: {e}")


# Build the configuration form and render it in this cell's output.
config_widget = ProtSpaceConfigWidget()
display(config_widget.widget)

In [None]:
# @title üì• Download ProtSpace Bundle {display-mode: "form"}
# @markdown Run this cell, then upload the downloaded file at https://protspace.app/

# Offer the generated bundle for download; the generation cell publishes its
# path as the global "protspace_output_file".
if "protspace_output_file" in globals():
    output_path = Path(globals()["protspace_output_file"])
    if not output_path.exists():
        print(f"‚ùå Output file not found: {output_path}")
    else:
        size_mb = output_path.stat().st_size / (1 << 20)
        print(f"‚úÖ {output_path.name} ({size_mb:.2f} MB)")
        print("üì¶ Starting download...\n")
        try:
            files.download(str(output_path))
        except Exception as e:
            print(f"‚ùå Download failed: {e}")
        else:
            print("‚úÖ Download complete!")
            print(f"üåê Next: upload {output_path.name} at https://protspace.app/")
else:
    print("‚ùå No output file found. Run the generation cell first.")