# SEMA-1D 

SEMA-1D is a fine-tuned ESM-1v model that predicts epitope residues from an antigen protein sequence.

## 1. Set up Environment

In [16]:
# Install the notebook's extra dependencies into the kernel's environment
# (%pip targets the running kernel; only s3fs is pinned to a specific version).
%pip install datasets huggingface-hub s3fs=='0.4.2' fair-esm
# Remove TensorFlow from the kernel environment without prompting (-y).
# NOTE(review): the reason for uninstalling TensorFlow is not stated in this notebook — confirm it is still needed.
%pip uninstall tensorflow -y

Note: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
import boto3
import json
import os
import pandas as pd
import random
import sagemaker
from sagemaker.experiments.run import Run
from sagemaker.huggingface import HuggingFace, HuggingFaceModel
from sagemaker.inputs import TrainingInput
from time import strftime

# --- AWS sessions, clients and the default SageMaker bucket ---------------
boto_session = boto3.session.Session()
sagemaker_session = sagemaker.session.Session(boto_session)
S3_BUCKET = sagemaker_session.default_bucket()
s3 = boto_session.client("s3")
sagemaker_client = boto_session.client("sagemaker")
REGION_NAME = sagemaker_session.boto_region_name

# --- Execution role ---------------------------------------------------------
# First ask the session object for the role. If that attribute is missing,
# fall back to the resource metadata file and look the role up on the domain.
# NOTE(review): the SDK documents a module-level `sagemaker.get_execution_role`;
# if `Session` has no such method this fallback is the path that always runs — confirm.
try:
    sagemaker_execution_role = sagemaker_session.get_execution_role()
except AttributeError:
    NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"
    with open(NOTEBOOK_METADATA_FILE, "rb") as metadata_file:
        metadata = json.load(metadata_file)
    instance_name = metadata["ResourceName"]
    domain_id = metadata.get("DomainId")
    user_profile_name = metadata.get("UserProfileName")
    space_name = metadata.get("SpaceName")

    domain_desc = sagemaker_session.sagemaker_client.describe_domain(DomainId=domain_id)
    # Prefer the domain's space-level settings when present, else the user-level defaults.
    settings_key = (
        "DefaultSpaceSettings"
        if "DefaultSpaceSettings" in domain_desc
        else "DefaultUserSettings"
    )
    sagemaker_execution_role = domain_desc[settings_key]["ExecutionRole"]

print(f"Assumed SageMaker role is {sagemaker_execution_role}")

# --- S3 location and experiment name used throughout the notebook ----------
S3_PREFIX = "esm2-sema-1d"
S3_PATH = sagemaker.s3.s3_path_join("s3://", S3_BUCKET, S3_PREFIX)
print(f"S3 path is {S3_PATH}")

# A timestamp suffix gives every kernel session its own experiment name.
EXPERIMENT_NAME = f"esm2-sema-1d-{strftime('%Y-%m-%d-%H-%M-%S')}"
print(f"Experiment name is {EXPERIMENT_NAME}")

Assumed SageMaker role is arn:aws:iam::340752820161:role/service-role/AmazonSageMaker-ExecutionRole-20241011T160996
S3 path is s3://sagemaker-us-east-1-340752820161/esm2-sema-1d
Experiment name is esm2-sema-1d-2024-10-20-01-21-05


## 2. Build Dataset

In [18]:
# Local file names for the two splits (relative to the working directory)
train_local_path = 'train_set.csv'
test_local_path = 'test_set.csv'

# Raw CSV locations of the SEMA 2.0 train/test sets in the AIRI-Institute/SEMAi repository
train_data_url = 'https://raw.githubusercontent.com/AIRI-Institute/SEMAi/main/epitopes_prediction/data/sema_2.0/train_set.csv'
test_data_url = 'https://raw.githubusercontent.com/AIRI-Institute/SEMAi/main/epitopes_prediction/data/sema_2.0/test_set.csv'

# Read both splits into DataFrames before writing anything to disk
train_df = pd.read_csv(train_data_url)
test_df = pd.read_csv(test_data_url)

# Write each split back out as CSV, omitting the DataFrame index column
for split_df, split_path in ((train_df, train_local_path), (test_df, test_local_path)):
    split_df.to_csv(split_path, index=False)

Finally, we upload the training and test data to S3.

In [19]:
# Each split is staged under its own key prefix in the default bucket
train_key_prefix = f"{S3_PREFIX}/data/train"
test_key_prefix = f"{S3_PREFIX}/data/test"

sagemaker_session.upload_data(path=train_local_path, bucket=S3_BUCKET, key_prefix=train_key_prefix)
# Last expression of the cell, so the notebook displays the value returned for the test upload
sagemaker_session.upload_data(path=test_local_path, bucket=S3_BUCKET, key_prefix=test_key_prefix)


's3://sagemaker-us-east-1-340752820161/esm2-sema-1d/data/test/test_set.csv'

## 3. Train Model in SageMaker

In [20]:
# Hyperparameters handed to the training entry point (scripts/train.py).
hyperparameters = dict(
    epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-5,
    warmup_steps=0,
    weight_decay=0.0,
    gradient_accumulation_steps=1,
    seed=42,
)

# (metric name, regex) pairs; each regex captures one numeric value from the training logs.
_metric_patterns = [
    ("epoch", "'epoch': ([0-9.]*)"),
    ("max_gpu_mem", "Max GPU memory use during training: ([0-9.e-]*) MB"),
    ("train_loss", "'loss': ([0-9.e-]*)"),
    ("train_samples_per_second", "'train_samples_per_second': ([0-9.e-]*)"),
    ("eval_loss", "'eval_loss': ([0-9.e-]*)"),
    ("eval_accuracy", "'eval_accuracy': ([0-9.e-]*)"),
]
metric_definitions = [
    {"Name": metric_name, "Regex": metric_regex}
    for metric_name, metric_regex in _metric_patterns
]

# Keyword arguments for the HuggingFace estimator, grouped by concern.
_estimator_config = {
    # Job identity and code location
    "base_job_name": "esm-2-sema-1d",
    "entry_point": 'train.py',
    "source_dir": 'scripts',
    # Compute
    "instance_type": 'ml.p3.2xlarge',
    "instance_count": 1,
    # Framework versions
    "transformers_version": "4.28",
    "pytorch_version": "2.0",
    "py_version": "py310",
    # Outputs, permissions and training settings
    "output_path": f"{S3_PATH}/output",
    "role": sagemaker_execution_role,
    "hyperparameters": hyperparameters,
    "metric_definitions": metric_definitions,
    "checkpoint_local_path": "/opt/ml/checkpoints",
    "sagemaker_session": sagemaker_session,
    "tags": [{"Key": "project", "Value": "esm-fine-tuning"}],
}

# Define the HuggingFace Estimator
hf_estimator = HuggingFace(**_estimator_config)

In [21]:
# S3 locations of the two CSV files uploaded in the dataset section
train_s3_uri = f"{S3_PATH}/data/train/train_set.csv"
test_s3_uri = f"{S3_PATH}/data/test/test_set.csv"

# One CSV input channel per split
data_channels = {
    'train': TrainingInput(s3_data=train_s3_uri, content_type='text/csv'),
    'test': TrainingInput(s3_data=test_s3_uri, content_type='text/csv'),
}

# Launch the job inside an Experiments run; wait=False returns without blocking on completion.
with Run(experiment_name=EXPERIMENT_NAME, sagemaker_session=sagemaker_session) as run:
    hf_estimator.fit(data_channels, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: esm-2-sema-1d-2024-10-20-01-21-29-420


You can view metrics and debugging information for this run in SageMaker Experiments.

In [23]:
from sagemaker.analytics import ExperimentAnalytics

# Snapshot of the most recent training job's description
training_job_details = hf_estimator.latest_training_job.describe()
print(f"Training job name: {training_job_details.get('TrainingJobName')}")
print(f"Training job status: {training_job_details.get('TrainingJobStatus')}")
print(f"Training job output: {training_job_details.get('ModelArtifacts')}")

# Restrict the analytics query to trial components whose display name contains "Training"
display_name_filter = {"Name": "DisplayName", "Operator": "Contains", "Value": "Training"}
search_expression = {"Filters": [display_name_filter]}

trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=sagemaker_session,
    experiment_name=EXPERIMENT_NAME,
    search_expression=search_expression,
)

# Transpose so each tracked field is shown as a row; last expression is the cell output.
trial_component_df = trial_component_analytics.dataframe()
trial_component_df.T

Training job name: esm-2-sema-1d-2024-10-20-01-21-29-420
Training job status: InProgress
Training job output: None


Unnamed: 0,0
TrialComponentName,esm-2-sema-1d-2024-10-20-01-21-29-420-aws-trai...
DisplayName,esm-2-sema-1d-2024-10-20-01-21-29-420-aws-trai...
SourceArn,arn:aws:sagemaker:us-east-1:340752820161:train...
SageMaker.ImageUri,763104351884.dkr.ecr.us-east-1.amazonaws.com/h...
SageMaker.InstanceCount,1.0
SageMaker.InstanceType,ml.p3.2xlarge
SageMaker.VolumeSizeInGB,30.0
epochs,1.0
gradient_accumulation_steps,1.0
learning_rate,0.00001


## 4. Deploy Model as Real-Time Inference Endpoint

To deploy our endpoint, we call deploy() on our HuggingFace estimator object, passing in our desired number of instances and instance type.

In [8]:
%%time

predictor = hf_estimator.deploy(initial_instance_count=1, instance_type="ml.r5.2xlarge")

NameError: name 'estimator' is not defined

Try running a prediction on a protein sequence with known epitopes.

In [None]:
# Ara h 2 amino-acid sequence, used as the example antigen for
# conformational B-cell epitope prediction
ARA_H_2_SEQUENCE = "MAKLTILVALALFLLAAHASARQQWELQGDRRCQSQLERANLRPCEQHLMQKIQRDEDSYERDPYSPSQDPYSPSPYDRRGAGSSQHQERCCNELNEFENNQRCMCEALQQIMENQSDRLQGRQQEQQFKRELRNLPQQCGLRAPQRCDLDVESGG"

# Request payload: a single "sequence" field holding the protein sequence
sample_sequence = {"sequence": ARA_H_2_SEQUENCE}

# Invoke the deployed SageMaker endpoint with the payload
response = predictor.predict(sample_sequence)

# Show whatever the endpoint returned
print(response)


In [None]:
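# NOTE(review): these look like per-epoch metrics recorded from an earlier training run — confirm the source.
# Epoch  Training Loss  Validation Loss  Pearson R  Mse       R2 Score
# 1      0.212700       0.150756         0.251578   0.173891  -0.567424
# 2      0.157400       0.165494         0.253576   0.183997  -0.658516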


## 5. Clean up

Delete endpoint

In [None]:
# Best-effort endpoint cleanup: never abort the notebook here, but report what
# happened so a failed deletion (and a still-running, billable endpoint) is visible.
try:
    predictor.delete_endpoint()
except NameError:
    # `predictor` only exists if the deploy cell ran successfully.
    print("No predictor is defined in this session; no endpoint to delete.")
except Exception as cleanup_error:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate.
    print(f"Endpoint deletion failed; check the SageMaker console. Error: {cleanup_error!r}")

Delete S3 data

In [None]:
# Remove everything this notebook stored under its prefix in the default bucket.
bucket = boto_session.resource("s3").Bucket(S3_BUCKET)
# S3 prefix filters are plain string matches: without the trailing "/" this
# would also delete keys under sibling prefixes that merely start with the
# same text (e.g. "<S3_PREFIX>-backup/..."). Scope the delete to this folder only.
bucket.objects.filter(Prefix=f"{S3_PREFIX}/").delete()