# news_aggregator_training



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os

# Publish all DataZone identifiers to the process environment in one call.
os.environ.update(
    {
        'DataZoneProjectId': 'btceu4s17ranu1',
        'DataZoneDomainId': 'dzd-4kbjtzjqm94pk9',
        'DataZoneEnvironmentId': '4jlwpldxvx9d49',
        'DataZoneDomainRegion': 'us-east-2',
    }
)

# Metadata is reachable two ways: through the accessor function below and
# through the module-level ``metadata`` variable.
_resource_metadata = None


def _get_resource_metadata():
    """Return the resource metadata dict, building and caching it on first use."""
    global _resource_metadata
    if _resource_metadata is not None:
        return _resource_metadata

    additional = {
        "DataZoneProjectId": "btceu4s17ranu1",
        "DataZoneDomainId": "dzd-4kbjtzjqm94pk9",
        "DataZoneEnvironmentId": "4jlwpldxvx9d49",
        "DataZoneDomainRegion": "us-east-2",
    }
    _resource_metadata = {"AdditionalMetadata": additional}
    return _resource_metadata


metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None) -> None:
    """Route a logger to a rotating log file, replacing its current handlers.

    Args:
        log_dir: Directory that should hold the log file. If it cannot be
            created, ``/tmp/kernels/`` is used instead.
        log_file: Name of the log file inside the chosen directory.
        log_name: Name of the logger to configure. ``None`` (or an empty
            string) configures the root logger.

    The logger is set to INFO and gets a single ``RotatingFileHandler``
    (5 MiB per file, 5 backups, UTF-8). Handlers previously attached to the
    logger are detached and closed, so calling this function again (for
    example when the cell is re-run) does not leak open log files.
    """
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024
    backup_count = 5

    # fallback to /tmp dir on access errors, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"
        os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger(log_name) if log_name else logging.getLogger()

    # Detach AND close the old handlers. Merely rebinding ``logger.handlers``
    # would leave any previous file handler's stream open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info("Logging initialized for %s.", log_name)


# No log_name is given, so this call configures the root logger.
_set_logging(
    log_dir="/var/log/computeEnvironments/kernel/",
    log_file="kernel.log",
)
# The "metrics" logger gets its own log file.
_set_logging(
    log_dir="/var/log/studio/data-notebook-kernel-server/",
    log_file="metrics.log",
    log_name="metrics",
)

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

# Logger for this cell; the messages bracket the sparkutils initialisation
# so its start and end are visible in the log.
logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
# Bind the result of sparkutils.init() to the global name `spark`;
# the shutdown cell looks it up under that name and calls .stop() on it.
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.

    If the Python process started before the filesystem mount was complete,
    its working directory can still reference the old mount path and inode.
    Changing explicitly to /home/sagemaker-user makes the process use the
    current mount point.

    The device ID and inode of '.' and of /home/sagemaker-user are logged
    before and after the directory change, so the logs show whether the
    working directory now refers to the new mount.

    Any exception raised along the way is logged with its traceback and is
    not re-raised.

    Note:
        This runs when the cell defining it is executed, so the fix is
        applied as early as possible during kernel initialization.
    """
    import os
    import logging

    # Bind the logger before the try block so the except handler can always
    # use it, whatever fails inside the try.
    logger = logging.getLogger(__name__)

    def _log_state(banner):
        # Log the cwd plus (st_dev, st_ino) for '.' and for the home mount,
        # stat-ing each path once so device and inode come from one snapshot.
        logger.info(banner)
        logger.info("CWD: %s", os.getcwd())
        dot_stat = os.stat('.')
        logger.info("stat('.'): %s %s", dot_stat.st_dev, dot_stat.st_ino)
        home_stat = os.stat('/home/sagemaker-user')
        logger.info("stat('/home/sagemaker-user'): %s %s", home_stat.st_dev, home_stat.st_ino)

    try:
        _log_state("---------Before------")

        os.chdir("/home/sagemaker-user")

        _log_state("---------After------")
    except Exception as e:
        logger.exception("Failed to reset working directory: %s", e)

# Apply the working-directory reset as soon as this cell executes.
_reset_os_path()

## Notebook

In [0]:
import torch
import sagemaker
import transformers
from sagemaker.huggingface import HuggingFace

In [0]:
# Execution role that is handed to the HuggingFace estimator (role=role).
role = sagemaker.get_execution_role()

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [0]:
# SageMaker session object for this notebook, created with default settings.
sagemaker_session = sagemaker.Session()

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [0]:
# Training job definition: runs test.py from the current directory on a single
# ml.g4dn.xlarge instance and writes the model artifact under output_path.
estimator = HuggingFace(
    entry_point='test.py',
    source_dir='./',
    role=role,
    # Reuse the session created earlier in this notebook instead of letting
    # the estimator construct a second, separate Session.
    sagemaker_session=sagemaker_session,
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    transformers_version='4.56',
    pytorch_version='2.8',
    output_path='s3://news-aggregator-sadrian-bucket/models/',
    py_version='py312',
    # Forwarded to the entry point script as its hyperparameters.
    hyperparameters={
        'epochs': 1,
        'train_batch_size': 2,
        'valid_batch_size': 1,
        'learning_rate': 1e-05,
        'seq_max_len': 32
    },
    # NOTE(review): no metric_definitions are supplied; confirm that the
    # metrics you expect are actually captured for this job.
    enable_sagemaker_metrics=True
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [0]:
# Start the training job defined by `estimator`; no input channels are passed here.
estimator.fit()

2026-01-31 02:43:29 Starting - Starting the training job
2026-01-31 02:43:29 Pending - Training job waiting for capacity.

.

.


2026-01-31 02:43:42 Pending - Preparing the instances for training.

.

.


2026-01-31 02:44:27 Downloading - Downloading the training image.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.


2026-01-31 02:50:51 Training - Training image download completed. Training in progress..

[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 575.57.08[0m
[34mCurrent installed NVIDIA driver version is 570.195.03[0m
[34mAdding CUDA compat to LD_LIBRARY_PATH[0m
[34m/usr/local/cuda/compat:/usr/local/cuda/compat:/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/usr/local/cuda/lib64[0m
[34m2026-01-31 02:51:02,036 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2026-01-31 02:51:02,056 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2026-01-31 02:51:02,080 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.

[34müöÄstart script.py[0m
[34mepochs: 1, train_batch_size2, valid_batch_size: 1, learning_rate: 1e-05, data_path: news-aggregator-sadrian-bucket/newsCorpora.csv, seq_max_len: 32[0m
[34müîç Environment check[0m
[34mSM_MODEL_DIR: /opt/ml/model[0m
[34mSM_OUTPUT_DATA_DIR: /opt/ml/output/data[0m
[34müî• PyTorch version: 2.8.0+cu129[0m
[34mCUDA available: True[0m
[34müß™Starting dummy training process[0m
[34mstarting Epoch 0...[0m
[34mEpoch 1/1 - loss: 0.2468[0m
[34mmodel successfully trained[0m
[34msaving model into s3[0m
[34m‚úÖ Model saved to /opt/ml/model/smoke_test.bin[0m
[34müéâ test.py completed successfully[0m
[34m2026-01-31 02:51:10,868 sagemaker-training-toolkit INFO     Waiting for the process to finish and give a return code.[0m
[34m2026-01-31 02:51:10,868 sagemaker-training-toolkit INFO     Done waiting for a return code. Received 0 from exiting process.[0m
[34m2026-01-31 02:51:10,868 sagemaker-training-toolkit INFO     Reporting training SU


2026-01-31 02:51:29 Uploading - Uploading generated training model
2026-01-31 02:51:29 Completed - Training job completed


Training seconds: 442
Billable seconds: 442


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython


def _stop_spark_session():
    """Stop the ``spark`` object stored in the IPython user namespace, if any.

    Does nothing when there is no active IPython shell or when the user
    namespace has no ``spark`` entry (for example because the startup cell
    never ran or failed), so shutdown does not raise a secondary error.
    """
    shell = _get_ipython()
    # get_ipython() returns None when no interactive shell is running.
    if shell is None:
        return
    session = shell.user_ns.get("spark")
    if session is not None:
        session.stop()


_stop_spark_session()