In [3]:
# Notebook-wide run parameters.
# TF_NAME is handed to the %%bash cell as its first positional argument and is
# also used as the prefix of the SageMaker training job name.
TF_NAME = "AR"
# CELL_LINE is handed to the %%bash cell as its second positional argument,
# where it becomes the value of --validation_cell_lines.
# NOTE(review): the remote log lists this cell line's peak file as
# "22RV1_AR.bed" (upper-case V) - confirm that generate_training_peaks.py
# matches cell-line names case-insensitively.
CELL_LINE = "22Rv1"

In [4]:
%%bash -s "$TF_NAME" "$CELL_LINE"
# Generate the training/validation peak splits on the remote host and copy
# the resulting CSVs back into the local data_splits directory.
#
# Positional arguments (filled in by IPython from the Python variables above):
#   $1  transcription factor name
#   $2  cell line held out for validation

# Abort on the first failing command so that a failed upload or a failed remote
# run can never be followed by downloading stale CSVs from an earlier run.
set -euo pipefail

TF_NAME=${1:?transcription factor name (first argument) is required}
CELL_LINE=${2:?validation cell line (second argument) is required}

REMOTE_HOST=ucsf
REMOTE_SCRIPTS_DIR=/data1/datasets_1/human_cistrome/chip-atlas/peak_calls/tfbinding_scripts/scripts
LOCAL_PROCESSING_DIR=/Users/wejarrard/projects/tf-binding/src/processing
LOCAL_SPLITS_DIR=/Users/wejarrard/projects/tf-binding/data/data_splits

# Upload the script to the remote server
scp -r "${LOCAL_PROCESSING_DIR}" "${REMOTE_HOST}:${REMOTE_SCRIPTS_DIR}"

# Execute commands on the remote server.
# -T: the commands arrive on stdin, so no pseudo-terminal is wanted; this
#     silences the "Pseudo-terminal will not be allocated" warning.
# The heredoc delimiter is unquoted on purpose: ${VAR} is expanded locally
# before sending, while \${VAR} is left for the remote shell to expand.
ssh -T "${REMOTE_HOST}" <<ENDSSH
    TF_NAME="${TF_NAME}"
    CELL_LINE="${CELL_LINE}"

    # Stop immediately if the working directory or the environment is missing,
    # instead of running python from the wrong place / wrong interpreter.
    cd "${REMOTE_SCRIPTS_DIR}/" || exit 1
    source activate processing || exit 1

    # Run the Python script with provided arguments.
    # This is the last remote command, so its exit status becomes ssh's exit
    # status and is caught by the local 'set -e'.
    python generate_training_peaks.py "\${TF_NAME}" --balance --validation_cell_lines "\${CELL_LINE}"
ENDSSH

# Download the resulting files
REMOTE_SPLITS_DIR="${REMOTE_SCRIPTS_DIR}/data/transcription_factors/${TF_NAME}/entire_set"
for split in validation training; do
    scp "${REMOTE_HOST}:${REMOTE_SPLITS_DIR}/${split}_combined.csv" "${LOCAL_SPLITS_DIR}"
done

Pseudo-terminal will not be allocated because stdin is not a terminal.


2024-10-18 10:46:16,387 - INFO - Validation cell lines: ['22Rv1']
2024-10-18 10:46:18,766 - INFO - MCF-7_AR.bed: 927 positive samples, 257893 negative samples
2024-10-18 10:46:20,391 - INFO - 22RV1_AR.bed: 16434 positive samples, 87752 negative samples
2024-10-18 10:46:20,744 - INFO - NCI-H1437_AR.bed: 82 positive samples, 24251 negative samples
2024-10-18 10:46:21,891 - INFO - A549_AR.bed: 136 positive samples, 145226 negative samples
2024-10-18 10:46:23,127 - INFO - LNCAP_AR.bed: 20860 positive samples, 94160 negative samples
2024-10-18 10:46:25,279 - INFO - VCAP_AR.bed: 32203 positive samples, 231985 negative samples
2024-10-18 10:46:26,111 - INFO - NCI-H2126_AR.bed: 119 positive samples, 99054 negative samples
2024-10-18 10:46:27,363 - INFO - C4-2_AR.bed: 31496 positive samples, 100618 negative samples
2024-10-18 10:46:28,913 - INFO - THP-1_AR.bed: 293 positive samples, 195991 negative samples
2024-10-18 10:46:29,698 - INFO - 42D_AR.bed: 576 positive samples, 83542 negative samples

In [1]:
import boto3
import os
from sagemaker import get_execution_role, Session
from sagemaker.pytorch import PyTorch
from sagemaker.debugger import TensorBoardOutputConfig

# Where the data lives: the local directory holding the downloaded split CSVs,
# the S3 key prefix they are uploaded under, and the S3 URI later passed to the
# training job as its "training" channel.
local_dir = "/Users/wejarrard/projects/tf-binding/data/data_splits"
prefix = "pretraining/data/"
training_data_s3_path = "s3://tf-binding-sites/pretraining/data/"

# IAM role ARN handed to the estimator in the next cell.
role = "arn:aws:iam::016114370410:role/tf-binding-sites"

# SageMaker session pinned to the project bucket, plus that bucket's name.
sagemaker_session = Session(default_bucket="tf-binding-sites")
bucket_name = sagemaker_session.default_bucket()

# Plain boto3 S3 client (not used in this cell; kept for later cells).
s3 = boto3.client("s3")

# Upload the contents of local_dir under the key prefix. No bucket is passed
# explicitly, so the session decides which bucket receives the files.
inputs = sagemaker_session.upload_data(key_prefix=prefix, path=local_dir)
print("Input spec: {}".format(inputs))


sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/wejarrard/Library/Application Support/sagemaker/config.yaml
Input spec: s3://tf-binding-sites/pretraining/data


In [2]:
# S3 destinations and container paths for the fine-tuning job's artifacts.
output_s3_path = "s3://tf-binding-sites/finetuning/results/output"
checkpoint_s3_bucket = "s3://tf-binding-sites/finetuning/results/checkpointing"
checkpoint_local_path = "/opt/ml/checkpoints"

# TensorBoard logs written inside the container are synced to this S3 prefix.
tensorboard_output_config = TensorBoardOutputConfig(
    container_local_output_path="/opt/ml/output/tensorboard",
    s3_output_path="s3://tf-binding-sites/finetuning/results/tensorboard",
)

# Spot-training settings. They are computed here, but the matching estimator
# keyword arguments below are commented out, so they currently have no effect
# on the job that gets launched.
use_spot_instances = True
if use_spot_instances:
    max_wait = 14 * 24 * 60 * 60  # 1209600 s = 14 days
else:
    max_wait = None

estimator = PyTorch(
    # Job identity and code
    base_job_name=f"{TF_NAME}-Full-Data-Model",  # TF_NAME comes from the parameter cell
    entry_point="multi_tf_prediction.py",
    source_dir="../training",
    code_location="s3://tf-binding-sites/finetuning/results/code",
    role=role,
    # Runtime environment
    framework_version="2.0.0",
    py_version="py310",
    # Hardware and limits
    instance_type="ml.g5.8xlarge",
    instance_count=1,
    volume_size=600,
    max_run=14 * 24 * 60 * 60,  # 1209600 s = 14 days
    # Training configuration and outputs
    hyperparameters={"learning-rate": 1e-5},
    output_path=output_s3_path,
    tensorboard_output_config=tensorboard_output_config,
    # Spot training / checkpointing (disabled):
    # use_spot_instances=use_spot_instances,
    # max_wait=max_wait,
    # checkpoint_s3_uri=checkpoint_s3_bucket,
    # checkpoint_local_path=checkpoint_local_path
)

# Launch the job on the "training" channel and return without blocking on it.
estimator.fit(inputs={"training": training_data_s3_path}, wait=False)



INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: AR-Full-Data-Model-2024-10-17-04-20-48-548
