# Train a model

We need to save our W&B credentials in the folder that will be copied to the SageMaker (SM) training instance.

In [2]:
!pip install -Uqqq transformers accelerate "datasets[s3]==2.13.0" sagemaker wandb

In [4]:
import wandb
# Store the W&B credentials under "scripts", the same folder that is passed as
# source_dir to the estimator later in this notebook.
wandb.sagemaker_auth(path="scripts")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


## SageMaker auth

In [19]:
import sagemaker
import boto3
# Bootstrap session, used only to look up the default bucket.
sess = sagemaker.Session()

# Bucket for uploaded data, models and logs. Leave it as None to fall back to
# the session default (SageMaker creates that bucket when it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Final session, bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summarise the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::372108735839:role/SageMakerExecutionRole
sagemaker bucket: sagemaker-us-east-1-372108735839
sagemaker session region: us-east-1


## Loading the data from W&B

In [7]:
import wandb
from datasets import load_from_disk

In [14]:
def load_from_artifact(at_address, at_type="dataset"):
    """Load a HF dataset from a W&B artifact.

    Args:
        at_address: Address of the artifact in W&B, e.g.
            ``"capecape/aws_llm_demo/wandbot_dataset:v2"``.
        at_type: Expected artifact type (defaults to ``"dataset"``).

    Returns:
        Whatever ``load_from_disk`` returns for the downloaded artifact directory.
    """
    if wandb.run is not None:
        # Inside an active run: declare the artifact as an input of that run.
        artifact = wandb.use_artifact(at_address, type=at_type)
    else:
        # wandb.use_artifact() requires wandb.init() first. Without a run, fetch
        # the artifact through the public API so this works on a fresh kernel.
        artifact = wandb.Api().artifact(at_address, type=at_type)
    artifact_dir = artifact.download()
    return load_from_disk(artifact_dir)

In [17]:
# Address of the W&B artifact that contains the dataset.
AT_ADDRESS = 'capecape/aws_llm_demo/wandbot_dataset:v2'
# Sanity check: download the artifact and display the dataset it contains.
load_from_artifact(AT_ADDRESS)

[34m[1mwandb[0m:   3 of 3 files downloaded.  


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2396
})

In [32]:
# S3 prefix handed to the estimator as the "training" input channel.
# NOTE(review): the bucket name is hard-coded, and the visible cells do not
# show the dataset being uploaded to this prefix — confirm the upload step.
DATASET_S3 = "s3://sagemaker-us-east-1-372108735839/processed/wandbot/train/"

## Train time! 🚂

In [20]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# Base model to fine-tune and the base name given to the training job.
MODEL_NAME = "codellama/CodeLlama-7b-Instruct-hf"
job_name = 'wandb-qlora-codellama7'

# Learning rate used during training.
lr = 2e-4

# Hyperparameters passed into the training job.
# NOTE(review): hf_token is sent as a plain hyperparameter, so it appears in the
# job's command line (see the captured error output) — consider a safer channel.
hyperparameters = {
    # pre-trained model
    'model_id': MODEL_NAME,
    # 'dataset_artifact': AT_ADDRESS,  # artifact containing the dataset at W&B
    # path where SageMaker will save the training dataset
    'dataset_path': '/opt/ml/input/data/training',
    # number of training epochs
    'epochs': 1,
    # batch size for training
    'per_device_train_batch_size': 2,
    # learning rate used during training
    'lr': lr,
    # Hugging Face token to access Llama 2
    'hf_token': HfFolder.get_token(),
    # whether to merge LoRA into the model (needs more memory)
    'merge_weights': True,
    # report to wandb
    'report_to': "wandb",
    'wandb_project': "aws_llm_demo",
    "run_name": MODEL_NAME + "__qlora",
}

# Create the estimator.
huggingface_estimator = HuggingFace(
    # train script, and the directory with all the files needed for training
    entry_point='run_clm.py',
    source_dir='scripts',
    # instance type and number of instances used for the training job
    instance_type='ml.g5.4xlarge',
    instance_count=1,
    # the name of the training job
    base_job_name=job_name,
    # IAM role used in the training job to access AWS resources, e.g. S3
    role=role,
    # the size of the EBS volume in GB
    volume_size=300,
    # transformers / PyTorch / Python versions used in the training job
    transformers_version='4.28',
    pytorch_version='2.0',
    py_version='py310',
    # the hyperparameters passed to the training job
    hyperparameters=hyperparameters,
    # set env variable to cache models in /tmp
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [34]:
# Define a data input dictionary mapping each channel name to its S3 URI. The
# "training" channel matches the last component of the dataset_path
# hyperparameter ('/opt/ml/input/data/training').
data = {'training': DATASET_S3}

# Start the training job with the S3 dataset as input; wait=True makes the
# call block while the job runs.
huggingface_estimator.fit(data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: wandb-qlora-codellama7-2023-10-12-15-46-42-340


Using provided s3_resource
2023-10-12 15:46:42 Starting - Starting the training job...
2023-10-12 15:46:58 Starting - Preparing the instances for training......
2023-10-12 15:48:08 Downloading - Downloading input data...
2023-10-12 15:48:33 Training - Downloading the training image...............
2023-10-12 15:51:09 Training - Training image download completed. Training in progress......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-10-12 15:51:56,497 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-10-12 15:51:56,510 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-10-12 15:51:56,519 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-10-12 15:51:56,520 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[

UnexpectedStatusException: Error for Training job wandb-qlora-codellama7-2023-10-12-15-46-42-340: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "raise FileNotFoundError(f"Directory {dataset_path} not found")
 FileNotFoundError: Directory lm_dataset not found"
Command "/opt/conda/bin/python3.10 run_clm.py --dataset_artifact capecape/aws_llm_demo/wandbot_dataset:v2 --epochs 1 --hf_token <REDACTED> --lr 0.0002 --merge_weights True --model_id codellama/CodeLlama-7b-Instruct-hf --per_device_train_batch_size 2 --report_to wandb --run_name codellama/CodeLlama-7b-Instruct-hf__qlora --wandb_project aws_llm_demo", exit code: 1

