In [None]:
import shlex

from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment, Datastore
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies
import azureml._restclient.snapshots_client
from azureml.fsspec import AzureMachineLearningFileSystem
import mlflow

In [None]:
from azure.ai.ml import MLClient#, Input, command
from azure.identity import DefaultAzureCredential
import sys
sys.path.append("..")
from utils import azure_ml_configs

# Workspace coordinates come from the shared `utils.azure_ml_configs` module
# so that no identifiers are hard-coded in this notebook.
subscription_id = azure_ml_configs.subscription_id
resource_group = azure_ml_configs.resource_group
workspace_name = azure_ml_configs.workspace_name
# Read here alongside the other settings; not passed to MLClient below.
workspace_id = azure_ml_configs.workspace_id

# SDK v2 handle to the workspace, authenticated through the default Azure
# credential chain.
ml_client = MLClient(
    workspace_name=workspace_name,
    resource_group_name=resource_group,
    subscription_id=subscription_id,
    credential=DefaultAzureCredential(),
)

In [None]:
# Select the registered data asset to train on. Other parts that have been
# selected here instead (each fetched with version=2):
#   "clinicalNote44M_pretraindata_randompart1"
#   "clinicalNote44M_pretraindata_randompart2"
#   "clinicalNote44M_pretraindata_randompart3"
data_asset = ml_client.data.get(
    name="clinicalNote44M_pretraindata_randompart4",
    version=2,
)

# Show the resolved URI; it is what gets handed to the training script.
print(f"Data asset URI: {data_asset.path}")

In [None]:
# Load the (SDK v1) workspace from the local config and open the experiment
# under which the training runs are grouped.
ws = Workspace.from_config()
experiment_name = "mlm-jobs"
experiment = Experiment(name=experiment_name, workspace=ws)

# Raise the maximum snapshot size (in bytes; 10_073_741_824 B is ~10.07 GB).
# The limit is a module-level attribute of a private azureml module, so it is
# overridden by plain assignment.
snapshot_size = 10_073_741_824
azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = snapshot_size

# Curated environment the job runs in, looked up by name in the workspace.
curated_env = Environment.get(
    workspace=ws,
    name="AzureML-ACPT-pytorch-1.11-py38-cuda11.3-gpu",
)


# Build the shell command that the job executes: install the extra
# requirements, then launch the MLM training script through `accelerate`.
#
# Notes on the training arguments:
# - --model_name_or_path was first 'psyroberta/psyroberta_part1', then
#   'psyroberta/psyroberta_part2' and lastly 'psyroberta/psyroberta_part3',
#   which, after training, creates the final psyroberta_part4.
# - For validation, use either --validation_split_percentage or
#   --validation_file val_data_path.
# - --checkpointing_steps can be "epoch" or a number n, meaning a checkpoint
#   is saved every n steps.
# - Disabled option, kept for reference: --num_warmup_steps 1000

# The data asset URI ends up on a shell command line, so it is escaped with
# shlex.quote() instead of being wrapped in hand-written single quotes; a URI
# containing a quote or other shell metacharacter can then no longer break or
# alter the command. Only this one argument is interpolated, so literal braces
# in any other argument are safe as well.
train_file_arg = shlex.quote(data_asset.path)

command = " ".join([
    "pip install -r mlm_requirements.txt",
    "&&",
    "accelerate launch --multi_gpu --mixed_precision 'fp16' --num_processes 4",
    "run_mlm_no_trainer.py",
    "--model_name_or_path 'psyroberta/psyroberta_part3'",
    f"--train_file {train_file_arg}",
    "--azure_data_asset",
    "--max_seq_length 512",
    "--validation_split_percentage 5",
    "--per_device_train_batch_size 64",
    "--per_device_eval_batch_size 64",
    "--seed 21",
    "--output_dir './outputs'",
    "--with_tracking",
    "--report_to 'mlflow'",
    "--log_dir './logs'",
    "--checkpointing_steps 2000",
    "--num_train_epochs 3",
])

# Echo the final command line so it can be checked before submission.
print(command)



# Describe the run: snapshot the current directory, execute `command` inside
# the curated environment, on the named compute target.
config = ScriptRunConfig(
    source_directory=".",
    compute_target="Terne4A100",
    environment=curated_env,
    command=command,
)

In [None]:
# Hand the configured job to Azure ML under the experiment.
Run = experiment.submit(config)

# Print the ml.azure.com link for following the run in the portal.
portal_url = Run.get_portal_url()
print(portal_url)

# Block this cell until the run finishes, streaming its output here.
Run.wait_for_completion(show_output=True)