# Training Notebook
This notebook demonstrates how to train Parler-TTS models using SageMaker.
* Push the training data to S3
* Train the model
* Spot check the training results

In [None]:
import sagemaker
import boto3
import os

# Name of the S3 bucket the session uses for uploaded data, models and logs.
# Leave it as None to fall back to the session's default bucket; SageMaker
# creates that bucket automatically if it does not exist yet.
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. When get_execution_role() raises ValueError,
# look up the 'sagemaker_execution_role' role through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Recreate the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration so it can be checked before training.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## Build the Training container
In the terminal, run the following commands to build the training container and push it to ECR.

**Note:** In a production environment, you will likely want to build the container in a CI/CD pipeline by executing the script as an action in GitHub, GitLab, or another CI/CD platform.

```bash
$ cd ..
$ . ./script/build-serve-container.sh train
```

Copy the output URI from the terminal output and paste it into the variable `image_uri` below.

In [7]:
# URI of the training container image; replace the placeholder with the URI
# printed by the build script before running the cells below.
image_uri = "<YOUR CONTAINER URI>"

In [None]:
from datetime import datetime
from sagemaker.estimator import Estimator

# Fail fast with a clear message if the placeholder was never replaced,
# instead of letting the training-job request fail on an invalid image URI.
if not image_uri or image_uri.startswith('<'):
    raise ValueError(
        "image_uri is still a placeholder; set it to the container URI "
        "printed by the build script before starting a training job."
    )

# Give every run a unique job name by appending a local-time timestamp.
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
job_name = f'parler-tts-training-{timestamp}'

# Configure the estimator around the custom training container. No
# hyperparameters are passed here: the container is expected to carry its own
# training configuration.
estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.g6.12xlarge',  # GPU instance
    volume_size=128,  # EBS volume size in GB
    max_run=86400,  # Maximum runtime in seconds (24 hours)
    sagemaker_session=sess,
    # Environment variables passed to the training container for GPU access
    environment={
        'NVIDIA_VISIBLE_DEVICES': 'all',
        # NOTE(review): this exposes only GPU index 0 to CUDA, while AWS lists
        # ml.g6.12xlarge as a multi-GPU (4 GPU) instance type -- confirm that
        # single-GPU training is intended, otherwise the other GPUs sit idle.
        'CUDA_VISIBLE_DEVICES': '0',
    },
)

# Start the training job without input channels (the container sources its
# own data). wait=False returns immediately instead of streaming the logs.
estimator.fit(job_name=job_name, wait=False)

print(f"Training job '{job_name}' started.")
print("You can monitor the job in the SageMaker console or using the command:")
print(f"aws sagemaker describe-training-job --training-job-name {job_name}")