# news_aggregator_training



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os

# Publish the DataZone project/domain/environment identifiers and region
# through the process environment in one step.
os.environ.update({
    'DataZoneProjectId': 'btceu4s17ranu1',
    'DataZoneDomainId': 'dzd-4kbjtzjqm94pk9',
    'DataZoneEnvironmentId': '4jlwpldxvx9d49',
    'DataZoneDomainRegion': 'us-east-2',
})

# create both a function and variable for metadata access
_resource_metadata = None

def _get_resource_metadata():
    global _resource_metadata
    if _resource_metadata is None:
        _resource_metadata = {
            "AdditionalMetadata": {
                "DataZoneProjectId": "btceu4s17ranu1",
                "DataZoneDomainId": "dzd-4kbjtzjqm94pk9",
                "DataZoneEnvironmentId": "4jlwpldxvx9d49",
                "DataZoneDomainRegion": "us-east-2",
            }
        }
    return _resource_metadata
metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024
    backup_count = 5

    # fallback to /tmp dir on access, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger() if not log_name else logging.getLogger(log_name)
    logger.handlers = []
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info(f"Logging initialized for {log_name}.")


# No logger name is given, so this configures the root logger to write kernel.log.
_set_logging("/var/log/computeEnvironments/kernel/", "kernel.log")
# Configures the separate logger named "metrics" with its own metrics.log file.
_set_logging("/var/log/studio/data-notebook-kernel-server/", "metrics.log", "metrics")

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

# Logger named after the current module; used to bracket the Spark start-up in the log.
logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
# Keep the object returned by sparkutils.init() as `spark`; the shutdown cell
# later looks it up under that name and calls .stop() on it.
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.
    
    This function resolves a race condition where the Python process starts
    before the filesystem mount is complete, causing the process to reference
    old mount paths and inodes. By explicitly changing to the mounted directory
    (/home/sagemaker-user), we ensure the process uses the correct, up-to-date
    mount point.
    
    The function logs stat information (device ID and inode) before and after
    the directory change to verify that the working directory is properly
    updated to reference the new mount.
    
    Note:
        This is executed at module import time to ensure the fix is applied
        as early as possible in the kernel initialization process.
    """
    try:
        import os
        import logging

        logger = logging.getLogger(__name__)
        logger.info("---------Before------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)

        os.chdir("/home/sagemaker-user")

        logger.info("---------After------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)
    except Exception as e:
        logger.exception(f"Failed to reset working directory: {e}")

# Apply the working-directory reset immediately when this startup cell executes.
_reset_os_path()

## Notebook

In [0]:
import torch
import sagemaker
import transformers
from sagemaker.huggingface import HuggingFace

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml


sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


  from .autonotebook import tqdm as notebook_tqdm


In [0]:
# Execution role for the current environment; passed to the estimator below as `role`.
role = sagemaker.get_execution_role()

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [0]:
# NOTE(review): this session is not passed to the HuggingFace estimator created
# below, which presumably builds its own default session — confirm whether this
# cell is still needed or whether `sagemaker_session=` should be supplied there.
sagemaker_session = sagemaker.Session()

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [0]:
# Regexes that extract the epoch-level numbers from the training log.
# enable_sagemaker_metrics alone does not parse a custom script's output;
# without metric definitions nothing from script.py is published as a metric.
# The patterns match the summary lines script.py prints once per epoch, e.g.
#   "Epoch 0 Training loss: 0.22934037877846197"
# and deliberately skip the intermediate "Training loss: ..." progress lines.
METRIC_VALUE_PATTERN = r'([0-9.e+-]+)'
METRIC_DEFINITIONS = [
    {'Name': 'train:loss', 'Regex': r'Epoch [0-9]+ Training loss: ' + METRIC_VALUE_PATTERN},
    {'Name': 'train:accuracy', 'Regex': r'Epoch [0-9]+ Training Accuracy: ' + METRIC_VALUE_PATTERN},
    {'Name': 'validation:loss', 'Regex': r'Epoch [0-9]+ Validation loss: ' + METRIC_VALUE_PATTERN},
    {'Name': 'validation:accuracy', 'Regex': r'Epoch [0-9]+ Validation Accuracy: ' + METRIC_VALUE_PATTERN},
]

# Single-instance GPU training job that runs script.py from the current
# directory and writes the model artifact under output_path.
estimator = HuggingFace(
    entry_point='script.py',
    source_dir='./',
    role=role,
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    transformers_version='4.56',
    pytorch_version='2.8',
    output_path='s3://news-aggregator-sadrian-bucket/models/',
    py_version='py312',
    # Forwarded to script.py as command-line hyperparameters.
    hyperparameters={
        'epochs': 2,
        'train_batch_size': 8,
        'valid_batch_size': 16,
        'learning_rate': 3e-05,
        'seq_max_len': 256
    },
    metric_definitions=METRIC_DEFINITIONS,
    enable_sagemaker_metrics=True
)

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [0]:
# Launch the training job and stream its log into the cell output until the job
# completes (the captured run below reports 15596 training seconds).
# No input channels are passed here, so script.py presumably loads its own
# data — TODO confirm.
estimator.fit()

2026-01-31 08:34:51 Starting - Starting the training job
2026-01-31 08:34:51 Pending - Training job waiting for capacity.

.

.


2026-01-31 08:35:14 Pending - Preparing the instances for training.

.

.


2026-01-31 08:35:38 Downloading - Downloading input data.

.

.


2026-01-31 08:35:59 Downloading - Downloading the training image.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.

.


2026-01-31 08:42:23 Training - Training image download completed. Training in progress..

[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 575.57.08[0m
[34mCurrent installed NVIDIA driver version is 570.195.03[0m
[34mAdding CUDA compat to LD_LIBRARY_PATH[0m
[34m/usr/local/cuda/compat:/usr/local/cuda/compat:/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/cuda/lib64:/opt/amazon/ofi-nccl/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/usr/local/cuda/lib64[0m
[34m2026-01-31 08:42:28,917 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2026-01-31 08:42:28,936 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2026-01-31 08:42:28,958 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.

[34mstart script.py[0m
[34mtrain test split performed[0m
[34mFull dataset:(422419, 2)[0m
[34mtrain dataset:(337935, 2)[0m
[34mvalidaiton dataset:(84484, 2)[0m
[34mSuccessfully labelled data[0m


[34mdata loaders created successfully[0m
[34mStarting training process[0m
[34mstarting Epoch 0...[0m
[34mTraining loss: 1.379422903060913[0m
[34mTraining Accuracy: 0.25[0m


[34mTraining loss: 0.34724444523279124[0m
[34mTraining Accuracy: 0.8975204959008198[0m


[34mTraining loss: 0.3043999723053315[0m
[34mTraining Accuracy: 0.9137086291370863[0m


[34mTraining loss: 0.28034263116651315[0m
[34mTraining Accuracy: 0.9214219052063196[0m


[34mTraining loss: 0.26461724570024536[0m
[34mTraining Accuracy: 0.926522423878806[0m


[34mTraining loss: 0.2534732036381933[0m
[34mTraining Accuracy: 0.9304027838886445[0m


[34mTraining loss: 0.24520456982691113[0m
[34mTraining Accuracy: 0.9331563947868404[0m


[34mTraining loss: 0.23845821103507697[0m
[34mTraining Accuracy: 0.9354875574983572[0m


[34mTraining loss: 0.23222046462355272[0m
[34mTraining Accuracy: 0.9374234394140146[0m


[34mEpoch 0 Training loss: 0.22934037877846197[0m
[34mEpoch 0 Training Accuracy: 0.9382810303756639[0m
[34mValidation loss: 0.023164115846157074[0m
[34mValidation Accuracy: 1.0[0m


[34mValidation loss: 0.22092097906297986[0m
[34mValidation Accuracy: 0.9483641358641358[0m


[34mValidation loss: 0.21438807112395022[0m
[34mValidation Accuracy: 0.9512743628185907[0m


[34mValidation loss: 0.21647565687453366[0m
[34mValidation Accuracy: 0.9508913695434855[0m


[34mValidation loss: 0.2129951346441501[0m
[34mValidation Accuracy: 0.9510122469382655[0m


[34mValidation loss: 0.21147583359973715[0m
[34mValidation Accuracy: 0.9514722055588882[0m


[34mEpoch 0 Validation loss: 0.20851799149994496[0m
[34mEpoch 0 Validation Accuracy: 0.9522158041759387[0m
[34mstarting Epoch 1...[0m
[34mTraining loss: 0.5670244693756104[0m
[34mTraining Accuracy: 0.875[0m


[34mTraining loss: 0.14690606354755775[0m
[34mTraining Accuracy: 0.9643321335732853[0m


[34mTraining loss: 0.15156864478660526[0m
[34mTraining Accuracy: 0.9626037396260374[0m


[34mTraining loss: 0.15228880560099772[0m
[34mTraining Accuracy: 0.9626941537230851[0m


[34mTraining loss: 0.15330176664198963[0m
[34mTraining Accuracy: 0.9622768861556922[0m


[34mTraining loss: 0.1541592426778514[0m
[34mTraining Accuracy: 0.9621415143394264[0m


[34mTraining loss: 0.15383933485560594[0m
[34mTraining Accuracy: 0.9621595946801773[0m


[34mTraining loss: 0.15415310622334638[0m
[34mTraining Accuracy: 0.9622725064998143[0m


[34mTraining loss: 0.15517101290908425[0m
[34mTraining Accuracy: 0.9621384465388365[0m


[34mEpoch 1 Training loss: 0.15547880318052817[0m
[34mEpoch 1 Training Accuracy: 0.9620400372852768[0m
[34mValidation loss: 0.0054789381101727486[0m
[34mValidation Accuracy: 1.0[0m


[34mValidation loss: 0.201112313610938[0m
[34mValidation Accuracy: 0.9551073926073926[0m


[34mValidation loss: 0.19454104801328578[0m
[34mValidation Accuracy: 0.9566779110444777[0m


[34mValidation loss: 0.19697986367471698[0m
[34mValidation Accuracy: 0.9556189603465511[0m


[34mValidation loss: 0.1927970749334921[0m
[34mValidation Accuracy: 0.9556829542614347[0m


[34mValidation loss: 0.19146204434053612[0m
[34mValidation Accuracy: 0.9564337132573485[0m


[34mEpoch 1 Validation loss: 0.1888559997648343[0m
[34mEpoch 1 Validation Accuracy: 0.9571043037734955[0m
[34mmodel successfully trained[0m
[34msaving model into s3[0m
[34msucessfully ended script.py[0m
[34m2026-01-31 12:55:12,793 sagemaker-training-toolkit INFO     Waiting for the process to finish and give a return code.[0m
[34m2026-01-31 12:55:12,793 sagemaker-training-toolkit INFO     Done waiting for a return code. Received 0 from exiting process.[0m
[34m2026-01-31 12:55:12,794 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m



2026-01-31 12:55:35 Uploading - Uploading generated training model
2026-01-31 12:55:35 Completed - Training job completed


Training seconds: 15596
Billable seconds: 15596


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython
# Look `spark` up in the interactive user namespace instead of as a bare name.
# NOTE(review): this raises KeyError when no `spark` variable exists there
# (e.g. if the startup cell that assigns it did not run) — confirm that is acceptable.
_get_ipython().user_ns["spark"].stop()