In [1]:
import os
import sys
import torch
import numpy as np
from torch_geometric.data import Data
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from atomsurf.protein.create_esm import get_esm_embedding_single, get_esm_embedding_batch
from atomsurf.utils.data_utils import AtomBatch, PreprocessDataset
from atomsurf.utils.python_utils import do_all
from atomsurf.utils.wrappers import DefaultLoader, get_default_model
from atomsurf.tasks.masif_site.preprocess import PreProcessMSDataset
from atomsurf.tasks.masif_site.model import MasifSiteNet
from atomsurf.tasks.masif_site.data_loader import MasifSiteDataset
from atomsurf.utils.data_utils import AtomBatch, PreprocessDataset, pdb_to_surf, pdb_to_graphs

In [2]:
def download_masif_site_dataset(data_dir="data/masif_site"):
    """
    Download the MaSIF site dataset.
    
    Args:
        data_dir: Directory to store the dataset
    """
    os.makedirs(data_dir, exist_ok=True)
    
    # Check if dataset already exists
    if os.path.exists(os.path.join(data_dir, "01-benchmark_pdbs")):
        print(f"Dataset already exists in {data_dir}")
        return
    
    print("Downloading MaSIF site dataset...")

    os.system(f"wget -O masif_site_masif_search_pdbs_and_ply_files.tar.gz https://zenodo.org/records/2625420/files/masif_site_masif_search_pdbs_and_ply_files.tar.gz")
    os.system(f"tar -xzf masif_site_masif_search_pdbs_and_ply_files.tar.gz -C {data_dir}")


In [3]:
def setup_directories():
    """Set up necessary directories for data processing and results."""
    data_dir = "data/masif_site"
    benchmark_pdb_dir = os.path.join(data_dir, "pdb")
    surface_dir = os.path.join(data_dir, "surfaces")
    rgraph_dir = os.path.join(data_dir, "rgraph")
    esm_dir = os.path.join(data_dir, "esm_emb")

    # Create output directories if they don't exist
    os.makedirs(surface_dir, exist_ok=True)
    os.makedirs(rgraph_dir, exist_ok=True)
    os.makedirs(esm_dir, exist_ok=True)

    return data_dir, benchmark_pdb_dir, surface_dir, rgraph_dir, esm_dir


In [4]:
def preprocess_data(data_dir, pdb_dir, esm_dir):
    """Preprocess the data including surface generation and ESM embeddings."""
    print("Starting data preprocessing...")
    
    # Initialize the preprocessing dataset
    dataset = PreProcessMSDataset(
        data_dir=data_dir,
        recompute_s=True,  # Set to True to recompute surfaces
        recompute_g=True,  # Set to True to recompute graphs
        face_reduction_rate=0.5,  # Adjust this value to control mesh resolution
        use_pymesh=False
    )

    # Run preprocessing
    print("Processing surfaces and graphs...")
    do_all(dataset, num_workers=4)  # Adjust number of workers based on your system

    # Generate ESM embeddings
    print("Generating ESM embeddings...")
    get_esm_embedding_batch(in_pdbs_dir=pdb_dir, dump_dir=esm_dir)
    
    print("Preprocessing complete!")

### Main Script

In [5]:
#download_masif_site_dataset()

In [6]:
data_dir, pdb_dir, surface_dir, rgraph_dir, esm_dir = setup_directories()

In [7]:
preprocess_data(data_dir, pdb_dir, esm_dir)

Starting data preprocessing...


FileNotFoundError: [Errno 2] No such file or directory: 'data/masif_site/01-benchmark_pdbs'