# ProtSpace: Interactive Protein Embedding Visualization

Interactive visualization of high-dimensional protein embeddings in 2D/3D space. Supports multiple dimensionality reduction methods (PCA, UMAP, t-SNE, MDS, PaCMAP) with annotation-based coloring and integrated structure viewing.

📚 [GitHub](https://github.com/tsenoner/protspace) • [Manuscript](https://www.sciencedirect.com/science/article/pii/S0022283625000063?via%3Dihub)


In [None]:
# @title Install Dependencies and Import Libraries (~1min)
%%capture
# !pip install -q "git+https://github.com/tsenoner/protspace.git@8c3f3e9df07a98a2ee0a85d91d592f001feb9139#egg=protspace"
!pip install -q protspace

import gzip
import os
import requests
import subprocess
import sys
from google.colab import files
from ipywidgets import SelectMultiple, Tab, VBox, HBox, Text, Checkbox, Button, Output, HTML, IntSlider, FloatSlider
from IPython.display import display, clear_output
from pathlib import Path
from urllib.parse import urlparse

import h5py
import numpy as np
import pandas as pd

# 📊 Data Collection & Embedding Upload

This section guides you through obtaining protein embeddings and uploading them to the notebook for visualization with ProtSpace.

## Step 1: Obtain Protein Embeddings

You can get protein embeddings in two ways:

1.  **From UniProt:**
    *   Go to the [UniProt website](https://www.uniprot.org/).
    *   Use the UniProt search syntax to find proteins (e.g., `(ft_domain:phosphatase) AND (reviewed:true)`).
    *   Click **"Customize"** → Select **"Embeddings"** to generate ProtT5 embeddings.
    *   Download the results from your [Jobs Dashboard](https://www.uniprot.org/tool-dashboard).

2.  **From your own FASTA file:**
    *   Generate embeddings using the dedicated Google Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tsenoner/protspace/blob/main/examples/notebook/ClickThrough_GenerateEmbeddings.ipynb)

## Step 2: Upload Your Embedding Data

Use the interactive widget in the code cell below to upload your embedding file (.h5, .hdf5, or .gz). Choose your preferred upload method and follow the on-screen instructions.

In [None]:
# @title 📂 Upload Embedding File {display-mode: "form"}
# @markdown Choose your preferred upload method and follow the instructions.

import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import drive, files
import h5py
import gzip
from pathlib import Path
import os
import io

# ============================================================================
# SHARED PROCESSING FUNCTION
# ============================================================================

def process_and_validate(filename, file_content, output_widget):
    """Write an uploaded embedding file to disk and validate it as HDF5.

    ``.gz`` uploads are decompressed first; ``.h5``/``.hdf5`` uploads and
    files without an extension are written unchanged. Extensions are matched
    case-insensitively. The written file is then opened with h5py and its
    first top-level entry must be a non-empty 1D array. Files that fail
    validation are removed from disk again.

    Args:
        filename: Name of the uploaded file; its extension selects the handling.
        file_content: Bytes-like content of the upload.
        output_widget: Output widget used as a context manager for status
            messages.

    Returns:
        A tuple ``(output_name, success_label)``. ``output_name`` is the path
        of the validated HDF5 file, or ``None`` on any failure.
        ``success_label`` is an HTML widget whose value holds the success
        banner (empty string on failure).
    """
    import html  # local import: only needed to escape the banner text

    hdf5_suffixes = ('.h5', '.hdf5')
    filepath = Path(filename)
    # Browsers match the picker's accept filter case-insensitively, so
    # 'EMB.H5' or 'emb.GZ' can reach this point and must not be rejected.
    suffix = filepath.suffix.lower()
    output_name = None
    success_label = widgets.HTML(value="")  # Filled only on success

    try:
        # Handle compressed .gz files
        if suffix == '.gz':
            stem = filepath.stem
            # 'emb.h5.gz' decompresses to 'emb.h5' rather than 'emb.h5.h5'.
            if Path(stem).suffix.lower() in hdf5_suffixes:
                output_name = stem
            else:
                output_name = stem + '.h5'
            with gzip.open(io.BytesIO(file_content), 'rb') as gz_f:
                decompressed_data = gz_f.read()
            with open(output_name, 'wb') as f:
                f.write(decompressed_data)

        # Handle uncompressed HDF5 or files with no extension (assume HDF5)
        elif suffix in hdf5_suffixes or suffix == '':
            output_name = filename if suffix in hdf5_suffixes else filepath.name + '.h5'
            with open(output_name, 'wb') as f:
                f.write(file_content)
        else:
            with output_widget:
                print(f"❌ Error: Unsupported file format '{filepath.suffix}' for '{filename}'. Please upload a .h5, .hdf5, or .gz file.")
            return None, success_label

        # Validate HDF5 and check embeddings
        if output_name and os.path.exists(output_name):
            try:
                with h5py.File(output_name, 'r') as f:
                    # Only the first top-level entry is inspected.
                    first_key = list(f.keys())[0]
                    first_embedding = f[first_key][:]
                    if len(first_embedding.shape) == 1 and first_embedding.shape[0] > 0:
                        with output_widget:
                            print(f"✅ Validation successful: Found embeddings for key '{first_key}' with dimension {first_embedding.shape[0]}.")
                        # The file name comes from the user, so escape it
                        # before embedding it in HTML.
                        safe_name = html.escape(output_name)
                        success_label.value = f"""
                        <div style="background-color: #d4edda; border: 2px solid #28a745;
                                    border-radius: 5px; padding: 15px; margin-top: 10px;">
                            <strong style="color: #155724; font-size: 16px;">
                                ✅ SUCCESS: Embedding file ready: {safe_name}
                            </strong>
                        </div>
                        """
                        return output_name, success_label
                    else:
                        with output_widget:
                            print(f"❌ Validation failed: Invalid embedding format in '{filename}'. Expected 1D embeddings.")
                        os.remove(output_name)  # Clean up invalid file
                        return None, success_label
            except (OSError, IndexError):
                # OSError: h5py could not open the file; IndexError: no keys.
                with output_widget:
                    print(f"❌ Validation failed: Could not open HDF5 file or no embeddings found in '{filename}'.")
                if os.path.exists(output_name):
                    os.remove(output_name)  # Clean up invalid file
                return None, success_label
        else:
            with output_widget:
                print(f"❌ Error: Failed to create processed file from '{filename}'.")
            return None, success_label

    except Exception as e:
        # Top-level boundary for the widget callbacks: report, clean up, and
        # signal failure through the return value instead of raising.
        with output_widget:
            print(f"❌ An unexpected error occurred while processing '{filename}': {e}")
        if output_name and os.path.exists(output_name):
            os.remove(output_name)  # Clean up on error
        return None, success_label


# ============================================================================
# METHOD 1: WIDGET UPLOAD (Small files, < 10MB)
# ============================================================================

def method1_widget_upload(output_widget):
    """Show an ipywidgets ``FileUpload`` button and process the chosen file.

    The upload is handled in an observer on the widget's ``value``. On
    success the module-level ``embedding_file`` is set to the validated file
    path and the success banner is displayed; on failure ``embedding_file``
    is reset to ``None`` and any previous banner is cleared.

    Args:
        output_widget: Output widget that receives instructions and status
            messages.
    """

    def _uploaded_items(value):
        """Return ``[(filename, content), ...]`` for either FileUpload layout."""
        if isinstance(value, dict):
            # ipywidgets 7: {filename: {'metadata': ..., 'content': bytes}}
            return [(name, entry['content']) for name, entry in value.items()]
        # ipywidgets 8 documents a tuple of entries carrying 'name' and a
        # memoryview 'content'; convert the view to bytes for later writes.
        return [(entry['name'], bytes(entry['content'])) for entry in value]

    def on_file_upload(change):
        """Handle a change of the FileUpload widget's ``value``."""
        global embedding_file
        global success_display_widget

        uploads = _uploaded_items(change['new'])
        if not uploads:
            return

        if len(uploads) > 1:
            with output_widget:
                print("⚠️ Warning: Multiple files uploaded. Please upload only one embedding file.")
            embedding_file = None
            # Clear previous success message if any
            if success_display_widget:
                success_display_widget.value = ""
            return

        # Process the single uploaded file
        filename, data = uploads[0]

        with output_widget:
            print(f"Processing uploaded file: {filename}")
        embedding_file, success_widget = process_and_validate(filename, data, output_widget)

        if embedding_file:
            # Display the success banner
            success_display_widget = success_widget
            display(success_display_widget)
        else:
            with output_widget:
                print("\n❌ File processing failed.")
            # Clear previous success message if any
            if success_display_widget:
                success_display_widget.value = ""

    # Create FileUpload widget
    upload_widget = widgets.FileUpload(
        accept='.h5,.hdf5,.gz',
        multiple=False,
        description='Choose File',
        button_style='primary'
    )

    # Attach event handler
    upload_widget.observe(on_file_upload, names='value')

    # Display instructions and widget
    with output_widget:
        print("📁 Method 1: Widget Upload (for small files < 10MB)")
        print("⚠️ Note: This method may fail silently for files larger than 10MB due to browser limitations.")
        display(upload_widget)

# ============================================================================
# METHOD 2: GOOGLE DRIVE (Large files, fastest!)
# ============================================================================

def is_embedding_file(filepath):
    """Return True when *filepath* ends in ``.h5``, ``.hdf5`` or ``.gz``.

    The comparison is case-sensitive and looks only at the final extension.
    """
    accepted_suffixes = {'.h5', '.hdf5', '.gz'}
    extension = Path(filepath).suffix
    return extension in accepted_suffixes


def method2_google_drive(output_widget):
    """Let the user pick an embedding file from their mounted Google Drive.

    Mounts Drive at ``/content/drive``, walks ``MyDrive`` for files accepted
    by ``is_embedding_file``, and shows a dropdown (newest first, with sizes)
    plus a button. Clicking the button reads the selected file and passes it
    to ``process_and_validate``; the module-level ``embedding_file`` is set to
    the validated path, or to ``None`` on failure.

    Args:
        output_widget: Output widget that receives all messages and the
            selection UI.
    """
    global embedding_file
    global success_display_widget  # Keep track of the success message widget

    drive_path = '/content/drive/MyDrive'

    with output_widget:
        print("📁 Method 2: Google Drive (FASTEST!)")
        print("=" * 60)

        # Mount Google Drive
        print("📁 Mounting Google Drive...")
        try:
            drive.mount('/content/drive', force_remount=False)
            print("✅ Google Drive mounted successfully.")
        except Exception as e:
            # Report the real cause instead of assuming "already mounted",
            # but keep going: the listing below still works if Drive is
            # available from an earlier mount.
            print(f"⚠️ Could not mount Google Drive: {e}")
            print("💡 Continuing in case the Drive is already mounted.")

        # List embedding files in My Drive
        print("\n🔍 Searching for potential embedding files (.h5, .hdf5, .gz) in your Drive...")

        # Each entry: (path relative to drive_path, size in bytes, mtime)
        found = []
        try:
            for root, _dirs, files_list in os.walk(drive_path):
                for f in files_list:
                    full_path = os.path.join(root, f)
                    if not is_embedding_file(full_path):
                        continue
                    try:
                        info = os.stat(full_path)
                    except OSError:
                        # Entry vanished or is unreadable: skip it rather
                        # than abort the whole listing.
                        continue
                    found.append((os.path.relpath(full_path, drive_path), info.st_size, info.st_mtime))

            if not found:
                print("\n⚠️ No embedding files found (.h5, .hdf5, .gz) in your Google Drive.")
                print("💡 Please upload your embedding file to Google Drive first.")

        except Exception as e:
            print(f"⚠️ Could not list files from Google Drive: {e}")
            found = []

        # Create dropdown for filename selection
        print("\n" + "=" * 60)

        # Most recently modified files first.
        found.sort(key=lambda item: item[2], reverse=True)
        file_options = [
            (f"{rel_path} ({size / (1024 * 1024):.1f} MB)", rel_path)
            for rel_path, size, _mtime in found
        ]

        file_dropdown = widgets.Dropdown(
            options=file_options,
            description='Select File:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='80%'),
            disabled=not found  # Disable if no files found
        )

        process_button = widgets.Button(
            description='Process Selected File',
            button_style='success',
            icon='check',
            disabled=not found  # Disable if no files found
        )

        status_output_inner = widgets.Output()  # Inner output for processing feedback

        # The click handler keeps its own reference to this banner so it
        # still updates the widget shown here if the global is rebound to a
        # different widget by another upload method later on.
        banner = widgets.HTML(value="")  # Success message area
        success_display_widget = banner

        def on_process_click(b):
            """Process the file currently selected in the dropdown."""
            global embedding_file

            # Clear previous success message
            banner.value = ""
            with status_output_inner:
                clear_output()

            filename = file_dropdown.value

            if not filename:
                with status_output_inner:
                    print("❌ No file selected from the dropdown.")
                return

            # Dropdown values are paths relative to drive_path.
            file_path = os.path.join(drive_path, filename)

            # Check if file exists (should exist if listed in dropdown)
            if not os.path.exists(file_path):
                with status_output_inner:
                    print(f"❌ Error: File not found at '{file_path}'. Please select a file from the dropdown or check its path.")
                    print("💡 Make sure the file path is correct.")
                embedding_file = None
                return

            with status_output_inner:
                print(f"📥 Reading file from Drive: {file_path}")

            try:
                # Read file content
                with open(file_path, 'rb') as f:
                    data = f.read()

                # Get just the filename for processing
                base_filename = os.path.basename(filename)
                embedding_file, success_widget = process_and_validate(base_filename, data, status_output_inner)

                if embedding_file:
                    # Display the success banner
                    banner.value = success_widget.value
                else:
                    with status_output_inner:
                        print("\n❌ File processing failed.")
                    banner.value = ""

            except Exception as e:
                with status_output_inner:
                    print(f"❌ An unexpected error occurred while reading file from Drive: {e}")
                embedding_file = None
                banner.value = ""

        process_button.on_click(on_process_click)

        # Still inside the output_widget context, so no nested `with` needed.
        if found:
            print("📝 Select your embedding file from the dropdown below:")
            display(widgets.VBox([widgets.HBox([file_dropdown, process_button]), status_output_inner, banner]))
        else:
            print("\n")  # Add a newline for cleaner separation


# ============================================================================
# METHOD 3: COLAB NATIVE UPLOAD (Medium files, slower but reliable)
# ============================================================================

def method3_colab_upload(output_widget):
    """Upload one embedding file through Colab's blocking ``files.upload()``.

    Exactly one file must be uploaded. On success the module-level
    ``embedding_file`` holds the validated path and the success banner is
    displayed; otherwise ``embedding_file`` becomes ``None`` and any earlier
    banner is cleared.

    Args:
        output_widget: Output widget that receives status messages.
    """
    global embedding_file
    global success_display_widget  # Banner widget shared across methods

    with output_widget:
        print("📁 Method 3: Colab Native Upload")
        print("⏳ A file dialog will open in your browser. Please select your embedding file.")
        print("⚠️ This method uploads directly from your computer and may be slow for large files.")

    # Blocks until the browser dialog is finished
    uploaded = files.upload()
    upload_count = len(uploaded)

    # Guard: anything other than exactly one file is rejected
    if upload_count != 1:
        if upload_count == 0:
            problem = "❌ No file was selected for upload."
        else:
            problem = "⚠️ Warning: Multiple files uploaded. Please upload only one embedding file at a time."
        with output_widget:
            print(problem)
        embedding_file = None
        if success_display_widget:
            success_display_widget.value = ""
        return

    # Exactly one entry: file name mapped to its content
    (filename, data), = uploaded.items()

    with output_widget:
        print(f"\nProcessing uploaded file: {filename}")
    embedding_file, success_widget = process_and_validate(filename, data, output_widget)

    if not embedding_file:
        with output_widget:
            print("\n❌ File processing failed.")
        # Drop a banner left over from an earlier successful upload
        if success_display_widget:
            success_display_widget.value = ""
        return

    # Show the success banner
    success_display_widget = success_widget
    display(success_display_widget)


# ============================================================================
# MAIN INTERFACE
# ============================================================================

# Module-level state shared by the three upload methods.
# embedding_file: path of the validated HDF5 file, or None until an upload
# succeeds (the upload methods reset it to None on failure).
embedding_file = None
# success_display_widget: the HTML banner widget of the most recent upload,
# kept so a later attempt can blank it.
success_display_widget = None # Initialize the success message widget variable

# Create method selection dropdown
# The option values (1, 2, 3) are the keys on_start_click dispatches on.
method_selector = widgets.Dropdown(
    options=[
        ('🎯 Widget Upload (Quick, <10MB only)', 1),
        ('🚀 Google Drive (Fastest - Recommended)', 2),
        ('📤 Colab Native Upload (Reliable)', 3),
    ],
    value=1,
    description='Method:',
    style={'description_width': 'initial'}
)

# Button that starts the selected upload method.
start_button = widgets.Button(
    description='Start Upload',
    button_style='success',
    icon='upload'
)

# Output area that the upload methods print into and display widgets in.
output_area = widgets.Output()

def on_start_click(b):
    """Run the upload method currently chosen in ``method_selector``.

    Clears ``output_area`` and any earlier success banner, then calls the
    matching method with ``output_area`` as its output widget.
    """
    global success_display_widget  # Banner shared with the upload methods

    # Dropdown value -> upload routine
    handlers = {
        1: method1_widget_upload,
        2: method2_google_drive,
        3: method3_colab_upload,
    }
    chosen = method_selector.value

    with output_area:
        clear_output()
        # A new attempt starts with no success banner showing
        if success_display_widget:
            success_display_widget.value = ""

        handler = handlers.get(chosen)
        if handler is not None:
            handler(output_area)

# Run the selected upload method when the start button is clicked.
start_button.on_click(on_start_click)

# Display interface
# Static header and a short guide to the three methods, printed as cell output.
print("="*60)
print("📂 EMBEDDING FILE UPLOAD")
print("="*60)
print("\n📌 Choose your upload method:")
print("   • Widget: Quick and convenient for small files (under 10MB).")
print("   • Google Drive: Fastest and recommended for large files. Ensure file is in your Drive.")
print("   • Colab Native: Reliable for various file sizes, uploads directly from your computer.")
print("\n")

# Controls first, then the output area the upload methods write into.
display(widgets.VBox([method_selector, start_button]))
display(output_area)

In [None]:
# @title 🚀 Generate ProtSpace Parquet Bundle {display-mode: "form"}
# @markdown Configure and run `protspace-local` to generate visualization files

import subprocess
import os
from pathlib import Path
from ipywidgets import SelectMultiple, Tab, VBox, HBox, Text, Checkbox, Button, Output, HTML, IntSlider, FloatSlider
from IPython.display import display, clear_output

# Feature names grouped by category. Each category becomes one tab in the
# feature selector; the selected names are joined with commas and passed to
# `protspace-local -f`.
FEATURES = {
    'UniProt': [
        'annotation_score',
        'cc_subcellular_location',
        'fragment',
        'length_fixed',
        'length_quantile',
        'protein_existence',
        'protein_families',
        'reviewed',
        'xref_pdb'
    ],
    'InterPro': [
        'cath',
        'pfam',
        'signal_peptide',
        'superfamily'
    ],
    'Taxonomy': [
        'root',
        'domain',
        'kingdom',
        'phylum',
        'class',
        'order',
        'family',
        'genus',
        'species'
    ]
}

# Display names of the dimensionality reduction methods offered in the UI.
# get_method_commands lower-cases them, strips '-' and appends '2'.
METHODS = ['PCA', 'UMAP', 't-SNE', 'MDS', 'PaCMAP']

# Method parameters with defaults
# Each entry maps a parameter name to (min, max, default). An int default
# yields an IntSlider, anything else a FloatSlider. Parameter names are
# forwarded to the command line as `--<name>`. PCA has no entry and therefore
# no sliders.
METHOD_PARAMS = {
    'UMAP': {
        'n_neighbors': (5, 200, 30),  # (min, max, default)
        'min_dist': (0.0, 1.0, 0.5)
    },
    't-SNE': {
        'perplexity': (5, 50, 30),
        'learning_rate': (10, 1000, 200)
    },
    'PaCMAP': {
        'n_neighbors': (5, 200, 30),
        'mn_ratio': (0.1, 1.0, 0.5),
        'fp_ratio': (1.0, 5.0, 2.0)
    },
    'MDS': {
        'n_init': (1, 10, 4),
        'max_iter': (50, 1000, 300)
    }
}

class ProtSpaceConfigWidget:
    def __init__(self) -> None:
        """Create the individual widgets, then arrange them into ``self.widget``."""
        # Order matters: create_layout reads attributes set by setup_widgets.
        self.setup_widgets()
        self.create_layout()

    def setup_widgets(self):
        """Create every input widget used by the configuration form.

        Sets ``feature_tabs`` (one multi-select per FEATURES category),
        ``methods`` (multi-select over METHODS), ``param_widgets`` and
        ``param_tabs`` (one slider per METHOD_PARAMS entry, grouped by
        method), the ``keep_temp`` and ``legacy_json`` checkboxes, the
        ``run_button`` (wired to ``run_protspace``) and the ``output`` area.
        """
        # Features pre-selected per category; categories not listed here
        # (Taxonomy) start with nothing selected.
        default_selections = {
            'UniProt': ['cc_subcellular_location', 'reviewed', 'fragment', 'length_fixed', 'protein_families'],
            'InterPro': ["pfam", "cath"],
        }

        # Feature selection tabs
        feature_widgets = {}
        for category, features in FEATURES.items():
            feature_widgets[category] = SelectMultiple(
                options=features,
                value=default_selections.get(category, []),
                description='',
                style={'description_width': 'initial'},
                layout={'height': '120px', 'width': '300px'}
            )

        self.feature_tabs = Tab()
        self.feature_tabs.children = list(feature_widgets.values())
        for i, category in enumerate(FEATURES.keys()):
            self.feature_tabs.set_title(i, category)

        # Method selection
        self.methods = SelectMultiple(
            options=METHODS,
            value=['PCA', 'UMAP'],
            description='',
            style={'description_width': 'initial'},
            layout={'height': '120px', 'width': '200px'}
        )

        # Method parameter widgets
        self.param_widgets = {}
        for method, params in METHOD_PARAMS.items():
            method_widgets = {}
            for param_name, (min_val, max_val, default_val) in params.items():
                label = param_name.replace('_', ' ').title() + ':'
                if isinstance(default_val, int):
                    widget = IntSlider(
                        value=default_val,
                        min=min_val,
                        max=max_val,
                        description=label,
                        style={'description_width': '120px'},
                        layout={'width': '300px'}
                    )
                else:
                    widget = FloatSlider(
                        value=default_val,
                        min=min_val,
                        max=max_val,
                        # A step of 10 was previously used whenever max > 1.0,
                        # which exceeds the whole 1.0-5.0 range of fp_ratio
                        # and left that slider unusable. Every float range
                        # defined here is small, so 0.1 works for all of them.
                        step=0.1,
                        description=label,
                        style={'description_width': '120px'},
                        layout={'width': '300px'}
                    )
                method_widgets[param_name] = widget
            self.param_widgets[method] = method_widgets

        # Create parameter tabs: one tab per method, sliders stacked vertically
        param_tab_children = []
        param_tab_titles = []
        for method in METHOD_PARAMS.keys():
            widgets_list = list(self.param_widgets[method].values())
            param_tab_children.append(VBox(widgets_list))
            param_tab_titles.append(method)

        self.param_tabs = Tab()
        self.param_tabs.children = param_tab_children
        for i, title in enumerate(param_tab_titles):
            self.param_tabs.set_title(i, title)

        # Other options
        self.keep_temp = Checkbox(
            value=False,
            description='Keep temporary files'
        )

        self.legacy_json = Checkbox(
            value=False,
            description='Generate legacy JSON format (instead of Parquet bundle)'
        )

        # Action button and output
        self.run_button = Button(
            description='🚀 Generate Bundle',
            button_style='primary',
            layout={'width': '200px'}
        )
        self.run_button.on_click(self.run_protspace)

        self.output = Output()

    def create_layout(self):
        """Stack the form sections, run button and output into ``self.widget``."""
        # Sections appear in the form in the order listed here.
        sections = [
            # Feature selection
            VBox([
                HTML('<h3>📋 Select Features</h3>'),
                HTML('<p><i>Choose features to include for coloring and analysis (hold Ctrl/Cmd to select multiple).</i></p>'),
                HTML('<p>Note: Selecting Taxonomy features for the first time will take ~1min to download the required database.</p>'),
                self.feature_tabs,
            ]),
            # Reduction method selection
            VBox([
                HTML('<h3>📊 Select 2D Reduction Methods</h3>'),
                HTML('<p><i>Choose dimensionality reduction methods for visualization (hold Ctrl/Cmd to select multiple).</i></p>'),
                self.methods,
            ]),
            # Per-method parameter sliders
            VBox([
                HTML('<h3>⚙️ Method Parameters</h3>'),
                HTML('<p><i>Adjust parameters for selected methods (PCA has no adjustable parameters).</i></p>'),
                self.param_tabs,
            ]),
            # General checkboxes
            VBox([
                HTML('<h3>🔧 General Options</h3>'),
                self.keep_temp,
                self.legacy_json,
            ]),
        ]

        # Run button row and the output area close the form.
        button_row = HBox([self.run_button])
        self.widget = VBox(sections + [button_row, self.output])

    def get_selected_features(self):
        """Get all selected features from all tabs."""
        selected = []
        for widget in self.feature_tabs.children:
            selected.extend(widget.value)
        return selected

    def get_method_commands(self):
        """Build method commands with parameters."""
        selected_methods = list(self.methods.value)
        method_commands = []

        for method in selected_methods:
            if method == 'PCA':
                method_commands.append('pca2')
            else:
                # Convert display name to command format
                method_cmd = method.lower().replace('-', '') + '2'
                if method_cmd == 'tsne2':
                    method_cmd = 'tsne2'

                method_commands.append(method_cmd)

        return method_commands

    def run_protspace(self, button):
        """Generate protspace command and execute it."""
        with self.output:
            clear_output()

            # Check if embedding file exists
            input_embedding_file = globals().get('embedding_file')
            if not input_embedding_file or not os.path.exists(input_embedding_file):
                print("❌ Error: No valid embedding file found. Please upload an embedding file in the section above first.")
                return False

            # Determine output file path based on input file
            input_path = Path(input_embedding_file)
            output_extension = ".json" if self.legacy_json.value else ".parquetbundle"
            output_file_name = input_path.stem + output_extension
            # The output directory will be the same as the input file's directory
            output_dir = input_path.parent

            # Ensure output directory exists
            output_dir.mkdir(parents=True, exist_ok=True)

            # Get selections
            features = self.get_selected_features()
            method_commands = self.get_method_commands()

            if not method_commands:
                print("❌ Error: Please select at least one dimensionality reduction method.")
                return False

            # Build base command
            cmd = [
                "protspace-local",
                "-i", input_embedding_file,
                "-m", ','.join(method_commands),
                "-o", str(output_dir / output_file_name) # Specify the output file path
            ]

            # Add features if specified
            if features:
                cmd.extend(["-f", ','.join(features)])

            # Add method parameters
            selected_methods = list(self.methods.value)
            for method in selected_methods:
                if method in METHOD_PARAMS:
                    for param_name, widget in self.param_widgets[method].items():
                        cmd.extend([f'--{param_name}', str(widget.value)])

            # Add optional flags
            if self.keep_temp.value:
                cmd.append("--keep-tmp")

            if self.legacy_json.value:
                cmd.append("--non-binary")

            # Display configuration summary
            print("--- ProtSpace Configuration ---")
            print(f"Input File: {input_embedding_file}")
            print(f"Output File: {output_dir / output_file_name}")
            print(f"Selected Methods: {', '.join(selected_methods)}")
            if features:
                print(f"Selected Features: {', '.join(features)}")
            else:
                print("Selected Features: None")
            # Show parameters for selected methods
            for method in selected_methods:
                 if method in METHOD_PARAMS:
                    params_str = ', '.join([
                        f'{param}={widget.value}'
                        for param, widget in self.param_widgets[method].items()
                    ])
                    print(f"{method} Parameters: {params_str}")
            print(f"Keep Temporary Files: {'Yes' if self.keep_temp.value else 'No'}")
            print(f"Generate Legacy JSON: {'Yes' if self.legacy_json.value else 'No'}")
            print("-----------------------------")
            print("\nExecuting command:")
            print(" ".join(cmd))
            print()

            # Execute command with Jupyter-native progress bars
            try:
                from ipywidgets import IntProgress, HTML as HTMLWidget, VBox as WidgetVBox
                from IPython.display import display as widget_display
                import re

                # Create progress tracking widgets
                progress_widgets = {}
                status_widget = HTMLWidget(value="<b>🚀 Starting ProtSpace processing...</b>")
                widget_display(status_widget)

                # Run the process
                process = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    bufsize=1,
                    universal_newlines=True
                )

                # Filter patterns for unwanted output
                skip_patterns = [
                    'Unable to register cu', 'WARNING: All log messages', 'E0000', 'W0000',
                    'This TensorFlow binary', 'Creating directory', 'Welcome to Bioservices',
                    'It looks like you do not have', 'We are creating one with default', 'Done',
                    'To enable the following instructions'
                ]

                current_tasks = set()

                # Stream output and update progress bars
                while True:
                    output = process.stdout.readline()
                    if output == '' and process.poll() is not None:
                        break
                    if output:
                        line = output.strip()

                        # Skip unwanted technical messages
                        if any(pattern in line for pattern in skip_patterns):
                            continue

                        # Parse progress bars
                        if 'Fetching' in line and '%|' in line:
                            # Extract task type
                            if 'UniProt features' in line:
                                task_key = 'uniprot'
                                task_name = '🔍 Fetching UniProt Features'
                            elif 'taxonomy features' in line:
                                task_key = 'taxonomy'
                                task_name = '🌿 Fetching Taxonomy Features'
                            elif 'InterPro features' in line:
                                task_key = 'interpro'
                                task_name = '🧬 Fetching InterPro Features'
                            else:
                                continue

                            # Create progress bar if not exists
                            if task_key not in progress_widgets:
                                progress_bar = IntProgress(
                                    value=0, min=0, max=100,
                                    description='',
                                    bar_style='info',
                                    style={'bar_color': '#17a2b8'},
                                    layout={'width': '400px'}
                                )
                                task_label = HTMLWidget(value=f"<b>{task_name}</b>")
                                progress_widgets[task_key] = {
                                    'bar': progress_bar,
                                    'label': task_label,
                                    'container': WidgetVBox([task_label, progress_bar])
                                }
                                widget_display(progress_widgets[task_key]['container'])
                                current_tasks.add(task_key)

                            # Extract and update progress
                            percent_match = re.search(r'(\d+)%', line)
                            if percent_match:
                                percent = int(percent_match.group(1))
                                progress_widgets[task_key]['bar'].value = percent

                                # Update color based on completion
                                if percent == 100:
                                    progress_widgets[task_key]['bar'].bar_style = 'success'
                                    progress_widgets[task_key]['label'].value = f"<b>{task_name} ✅</b>"

                # Wait for process to complete
                process.wait()

                if process.returncode == 0:
                    status_widget.value = "<b>✅ ProtSpace bundle generated successfully!</b>"
                    output_full_path = output_dir / output_file_name
                    print(f"\nOutput file created: {output_full_path}")

                    # Store output file path globally for download
                    globals()['protspace_output_file'] = str(output_full_path)
                    print(f"\nReady for download. Proceed to the next cell to download the bundle.")


                    return True
                else:
                    status_widget.value = f"<b>❌ ProtSpace processing failed with return code: {process.returncode}</b>"
                    print(f"\nError details (if any):\n{process.stdout.read()}")
                    return False

            except ImportError:
                # Fallback to simple text output if ipywidgets not available
                print("📊 Processing (progress bars not available)...")
                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode == 0:
                    output_full_path = output_dir / output_file_name
                    print("✅ ProtSpace bundle generated successfully!")
                    globals()['protspace_output_file'] = str(output_full_path)
                    print(f"\nOutput file created: {output_full_path}")
                    print(f"\nReady for download. Proceed to the next cell to download the bundle.")
                    return True
                else:
                    print(f"❌ ProtSpace processing failed:\n{result.stderr}")
                    return False

            except FileNotFoundError:
                print("❌ Error: protspace-local command not found. Please ensure ProtSpace is installed correctly.")
                return False
            except Exception as e:
                print(f"❌ An unexpected error occurred: {str(e)}")
                return False

# Create and display the widget.
# The instance is bound to the notebook-level name `config_widget`; its
# `.widget` attribute is the object passed to IPython's `display` so the
# configuration UI renders in this cell's output area.
config_widget = ProtSpaceConfigWidget()
display(config_widget.widget)

In [None]:
# @title 📥 Download ProtSpace Bundle {display-mode: "form"}
# @markdown ### Download your generated ProtSpace bundle file
# @markdown
# @markdown **Instructions:**
# @markdown 1. Run this cell to download your bundle file.
# @markdown 2. Once the download is complete, go to the **ProtSpace Web Viewer**: https://tsenoner.github.io/protspace_web/
# @markdown 3. On the web viewer, upload your downloaded `data.parquetbundle` file (or `data.json` if you selected the legacy format).
# @markdown 4. Explore your protein embeddings with interactive visualizations!

import os
from pathlib import Path
from google.colab import files

def _download_protspace_bundle():
    """Send the bundle written by the generation cell to the user's browser.

    Reads the path stored under the ``protspace_output_file`` global, prints
    the file's name and size, then hands it to Colab's ``files.download``.
    The missing-variable, missing-file and download-failure cases are each
    reported with a printed message.
    """
    # Guard 1: the generation cell has not stored an output path yet.
    if 'protspace_output_file' not in globals():
        print("❌ Error: No ProtSpace output file path found.")
        print("Please run the ProtSpace generation cell first, then try again.")
        return

    bundle_path = Path(globals()['protspace_output_file'])

    # Guard 2: a path was stored, but nothing exists at that location.
    if not bundle_path.exists():
        print("❌ Error: Output file not found at:")
        print(f"   {bundle_path}")
        print("\nPlease ensure the ProtSpace generation cell ran successfully and created the expected output file.")
        return

    bundle_name = bundle_path.name
    bytes_per_mb = 1024 * 1024
    size_in_mb = bundle_path.stat().st_size / bytes_per_mb

    print(f"✅ Found: {bundle_name} ({size_in_mb:.2f} MB)")
    print("📦 Starting download...\n")

    # Any exception raised while downloading is reported, not propagated,
    # so the cell always finishes with a readable message.
    try:
        files.download(str(bundle_path))
        print("✅ Download complete! Check your browser's download folder.")
        print(f"🌐 Next step: Upload {bundle_name} to the web viewer:")
        print("   https://tsenoner.github.io/protspace_web/")
    except Exception as exc:
        print(f"\n❌ Download failed: {exc!s}")
        print("Please try running this cell again or check your browser settings.")


print("🔍 Checking for ProtSpace output files...\n")
_download_protspace_bundle()