# Run Diffdock NIM on SageMaker JupyterLab App

In [2]:
import boto3, requests

## Initiate boto3 session

In [3]:
# Let boto3 resolve credentials through its default provider chain
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN environment
# variables, the shared ~/.aws/credentials file, or an attached IAM role)
# rather than pasting secrets into the notebook, where they would be saved
# and shared together with the code.
session = boto3.Session()

In [1]:
# Test connection by listing buckets
# s3_client = session.client('s3')
# s3_client.list_buckets()

## Create presigned domain URL

In [4]:
def create_presigned_domain_url(domain_id, user_profile_name, space_name, seession_expiration_duration_in_seconds=43200, expires_in_seconds=300, boto_session=None):

    """
    Create a presigned domain URL for a SageMaker domain
    :param domain_id: str, the ID of the SageMaker domain
    :param user_profile_name: str, the name of the user profile
    :param space_name: str, the name of the space (JupyterLab App)
    :param seession_expiration_duration_in_seconds: int, the duration of the session expiration, default is 43200 seconds (12 hours).
        The misspelled name is kept so existing keyword callers keep working.
    :param expires_in_seconds: int, the duration of the presigned URL expiration, default is 300 seconds (5 minutes)
    :param boto_session: optional session object exposing ``client(service_name)`` (for example a ``boto3.Session``)
        used to build the SageMaker client. When None, ``boto3.client('sagemaker')`` is used, as before.
    :return: str, the presigned domain URL, or None if the request fails or the response has no 'AuthorizedUrl'
    """
    # Initialize a SageMaker client, honoring an explicitly supplied session
    # so that its credentials/region are the ones actually used.
    if boto_session is None:
        sagemaker_client = boto3.client('sagemaker')
    else:
        sagemaker_client = boto_session.client('sagemaker')

    try:
        # Create the presigned domain URL
        response = sagemaker_client.create_presigned_domain_url(
            DomainId=domain_id,
            UserProfileName=user_profile_name,
            SpaceName=space_name,
            SessionExpirationDurationInSeconds=seession_expiration_duration_in_seconds,
            ExpiresInSeconds=expires_in_seconds
        )
        return response.get('AuthorizedUrl')
    except Exception as e:
        # Best-effort helper: report the problem and let the caller check for None.
        print(f"Error creating presigned domain URL: {e}")
        return None

In [5]:
# Example usage
domain_id = 'd-d9a41j1d4had'
user_profile_name = 'xiyu'
space_name = 'xyu-bionemo-nim' # name of the Jupyterlab app

presigned_url = create_presigned_domain_url(domain_id, user_profile_name, space_name)
# The helper returns None on failure; stop here instead of failing later with a confusing error.
assert presigned_url is not None, "Could not create a presigned domain URL (see the error printed above)"

# The query string carries an auth token, so keep it out of the saved cell output.
print(presigned_url.split('?', 1)[0] + '?token=<redacted>')

https://swbf6c9wfmnzhag.studio.us-east-1.sagemaker.aws/auth?token=<redacted>

In [6]:
# create a session with the presigned URL
session = requests.Session()
# requests never times out on its own; bound the wait so an unreachable endpoint fails fast
response = session.get(presigned_url, timeout=30)
assert response.status_code == 200, (
    f"Login with the presigned URL failed (HTTP {response.status_code}); "
    "the URL is short-lived, so create a fresh one and retry"
)


## Build urls

First, go to the running JupyterLab app and find the `sagemaker-space-id` in your browser's address bar. The URL looks something like this: 

```
https://<sagemaker-space-id>.studio.<region>.sagemaker.aws/jupyterlab/default/
```

![](../images/jupyterlab-app-url-example.png)

In [7]:
sagemaker_space_id = 'swbf6c9wfmnzhag' # replace with your space id
aws_region = 'us-east-1' # replace with the region that appears in your JupyterLab URL
# Port 8000 of the JupyterLab app is reached through the Jupyter server proxy path
base_url = f'https://{sagemaker_space_id}.studio.{aws_region}.sagemaker.aws/jupyterlab/default/proxy/8000'
health_check_url = f'{base_url}/v1/health/ready'
query_url = f'{base_url}/molecular-docking/diffdock/generate'

## Run health check remotely

In [8]:
# requests never times out on its own; bound the wait so a dead endpoint fails fast
response = session.get(health_check_url, timeout=30)
# Tolerate surrounding whitespace in the body and report what the server actually said on failure
assert response.status_code == 200 and response.text.strip() == "true", (
    f"Health check failed (HTTP {response.status_code}): {response.text[:200]!r}"
)

## Submit query remotely

In [9]:
def submit_query(protein_file_path, ligand_file_path, num_poses=20, time_divisions=20, steps=18, http_session=None, url=None):
    """
    Submit a query to the server
    :param protein_file_path: path to the protein file, must be a PDB file
    :param ligand_file_path: path to the ligand file, must be a txt, SDF, MOL2, file. If using batch-docking, only txt and SDF are supported.
    :param num_poses: int, number of poses to be generated, default is 20
    :param time_divisions: int, number of time divisions, default is 20
    :param steps: int, number of diffusion steps, default is 18
    :param http_session: optional object with a ``post(url, headers=..., json=...)`` method (e.g. a ``requests.Session``);
        defaults to the notebook-level ``session``
    :param url: optional endpoint to post to; defaults to the notebook-level ``query_url``
    :return: dict with keys "status_code" (int) and "response": the decoded JSON body if the request succeeded,
        otherwise the raw response text (error message)
    """
    # Fall back to the notebook globals only when nothing was passed explicitly,
    # so the function keeps working when called exactly as before.
    if http_session is None:
        http_session = session
    if url is None:
        url = query_url

    # Both files are sent as text inside the JSON payload
    with open(protein_file_path, 'r') as file:
        protein_text = file.read()
    with open(ligand_file_path, 'r') as file:
        ligand_text = file.read()

    # File type is the extension, lower-cased so "ligands.SDF" is sent as "sdf"
    ligand_file_type = str(ligand_file_path).rsplit('.', 1)[-1].lower()

    data = {
        "ligand": ligand_text,
        "ligand_file_type": ligand_file_type, # txt, sdf, mol2
        "protein": protein_text,
        "num_poses": num_poses,
        "time_divisions": time_divisions,
        "steps": steps,
        "save_trajectory": False, # diffusion trajectory
        "is_staged": False
    }

    headers = {"Content-Type": "application/json"}

    response = http_session.post(url, headers=headers, json=data)
    status_code = response.status_code
    try:
        response.raise_for_status() # raises for 4xx/5xx status codes
        body = response.json()
    except Exception:
        # HTTP error or a body that is not valid JSON: hand back the raw text.
        # `text` is a property, not a method.
        body = response.text

    return {
        "status_code": status_code,
        "response": body
    }


In [10]:
%%time
# Dock the ligands listed in the small batch example against its receptor,
# asking for 5 poses per ligand.
result = submit_query(
    "data/batch_input_small/receptor.pdb",
    "data/batch_input_small/input_smiles.txt",
    num_poses=5,
    time_divisions=20,
    steps=18,
)

CPU times: user 5.04 ms, sys: 1.96 ms, total: 7 ms
Wall time: 7.84 s


## Analyze results

In [16]:
# top-level keys of the dict returned by submit_query
result.keys()

dict_keys(['status_code', 'response'])

In [17]:
# unpack the decoded JSON payload stored under 'response' and list its fields
response = result['response']
response.keys()

dict_keys(['trajectory', 'ligand_positions', 'position_confidence', 'status', 'protein', 'ligand'])

In [18]:
# one status entry per input ligand; all 3 ligands succeeded in this run
response['status']

['success', 'success', 'success']

In [19]:
# predicted poses, indexed first by ligand and then by pose
poses = response['ligand_positions']

assert len(poses) == 3 # we have 3 ligands in the `input_smiles.txt` file.

# let's look at the first ligand. It has 5 predicted poses (we requested num_poses=5)
assert len(poses[0]) == 5

# let's look at the first pose of the first ligand (displayed as a molecule block string).
poses[0][0]

'protein_ligand_0\n     RDKit          3D\n\n 31 36  0  0  0  0  0  0  0  0999 V2000\n   -8.0381   -4.8383   56.1620 N   0  0  0  0  0  0  0  0  0  0  0  0\n   -8.3271   -6.1432   55.7041 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -9.0236   -7.0513   56.4274 N   0  0  0  0  0  0  0  0  0  0  0  0\n   -9.2966   -8.2848   55.9944 C   0  0  0  0  0  0  0  0  0  0  0  0\n  -10.0287   -9.2136   56.7750 N   0  0  0  0  0  0  0  0  0  0  0  0\n  -10.3223  -10.5127   56.1703 C   0  0  0  0  0  0  0  0  0  0  0  0\n  -10.9089  -11.3569   57.3107 C   0  0  0  0  0  0  0  0  0  0  0  0\n  -12.0309  -10.6637   57.8155 N   0  0  0  0  0  0  0  0  0  0  0  0\n  -13.1384  -11.4212   58.2274 C   0  0  0  0  0  0  0  0  0  0  0  0\n  -13.3743  -11.4900   59.7065 C   0  0  1  0  0  0  0  0  0  0  0  0\n  -13.5281  -12.9604   60.1133 C   0  0  0  0  0  0  0  0  0  0  0  0\n  -14.4159  -13.1156   61.2228 N   0  0  0  0  0  0  0  0  0  0  0  0\n  -14.4055  -14.0703   62.1678 C   0  0  0  0  0  0  0  0  0  

In [20]:
# this is a list with one entry per input ligand (3 in total)
assert len(response['position_confidence']) == 3

# each sublist has 5 scores, one score for each pose
assert len(response['position_confidence'][0]) == 5

# here is what the confidence scores of the 1st ligand look like
response['position_confidence'][0]

[-0.6733115315437317,
 -0.8990067839622498,
 -1.310671329498291,
 -1.9104959964752197,
 -3.0547292232513428]

In [20]:
from rdkit import Chem
from rdkit.Chem import AllChem, SDWriter
import os
import numpy as np
import shutil

# utility function to prepare (start clean) the output directory
def prepare_directory(temp):
    """
    Reset a directory so that it exists and is empty.

    Anything already present at the path is removed together with its
    contents before the directory is created again.
    :param temp: str: path to the directory
    """
    if not os.path.exists(temp):
        # Nothing to clean up: just create the directory (and any parents)
        os.makedirs(temp)
        return

    # Wipe the previous contents, then start over with an empty directory
    shutil.rmtree(temp)
    os.makedirs(temp)


# change it to your desired output directory. use the `prepare_directory` function to start clean.
output_dir = "output/batch_output_small"
prepare_directory(output_dir)

# output SDF file which has all ligands and all poses
output_sdf_file = os.path.join(output_dir, 'output.sdf') # output file path

# select indices where status is success
status_array = np.array(response['status'])
success_indices = np.where(status_array == 'success')[0]

# create a writer; it is closed in the `finally` below so the file is always
# flushed to disk, even if something goes wrong while writing
writer = SDWriter(output_sdf_file)
try:
    for i in success_indices:
        # e.g. ligand ID will be ligand_0, ligand_1, etc
        ligand_id = 'ligand_' + str(i)

        # get the ligand poses and confidence scores
        ligand_positions = response['ligand_positions'][i]
        confidence_scores = response['position_confidence'][i]

        # write to SDF file
        for idx, sdf_str in enumerate(ligand_positions):
            mol = Chem.MolFromMolBlock(sdf_str)
            if mol is None:
                # the mol block could not be parsed; report it instead of skipping silently
                print(f"Failed to parse molecule {ligand_id}_pose_{str(idx)}")
                continue
            try:
                # sanitize the molecule
                Chem.SanitizeMol(mol)
                # set the name, like ligand_0_pose_0, ligand_0_pose_1, etc
                mol.SetProp("_Name", ligand_id+f'_pose_{str(idx)}')
                # set the confidence score as a property
                mol.SetProp("Confidence", str(np.round(confidence_scores[idx], 4)))
                writer.write(mol)
            except Exception as err:
                # keep going with the remaining poses, but say why this one was dropped
                print(f"Failed to sanitize molecule {ligand_id}_pose_{str(idx)}: {err}")
                continue
finally:
    writer.close()


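You can now open the output directory (`output_dir = "output/batch_output_small"`) and find the output SDF file, `output.sdf`, which contains every written pose of every successfully docked ligand. 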

![](../images/Diffdock_result.png)