In [1]:
import time
import ray
import psutil
from ray.train.torch import TorchTrainer
from ray.air import ScalingConfig

# Initialize Ray
ray.init(ignore_reinit_error=True)

# Define the training function
@ray.remote
def train_fn(config):
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset

    # Simulate data size
    data_size = config["data_size"]
    batch_size = config.get("batch_size", 64)

    # Generate synthetic data (X: features, y: labels)
    X = torch.randn(data_size, 10)  # 10 features
    y = torch.randint(0, 2, (data_size,))

    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = nn.Sequential(
        nn.Linear(10, 16),
        nn.ReLU(),
        nn.Linear(16, 2)  # Output layer for 2 classes
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Training loop (1 epoch for faster execution)
    epochs = 2
    for epoch in range(epochs):
        total_loss = 0.0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

    return {"final_loss": total_loss}

# Dataset sizes for testing scalability (smaller size for quicker testing)
datasets = [1_000]  # Starting with a smaller dataset size for faster execution

# Loop through datasets and train
for size in datasets:
    start_time = time.time()
    start_memory = psutil.Process().memory_info().rss  # Track memory before training
    
    # Configure the trainer (Using fewer workers, num_workers=1)
    trainer = TorchTrainer(
        train_loop_per_worker=train_fn,
        scaling_config=ScalingConfig(num_workers=1),
        train_loop_config={"data_size": size}
    )

    # Train and get results
    results = trainer.fit()
    
    end_time = time.time()
    end_memory = psutil.Process().memory_info().rss  # Track memory after training
    
    training_time = end_time - start_time
    memory_usage = (end_memory - start_memory) / (1024 ** 2)  # Convert to MB
    
    print(f"Dataset size: {size}, Training time: {training_time:.2f} seconds, Memory usage: {memory_usage:.2f} MB")

# Shut down Ray
ray.shutdown()


2025-01-02 00:01:01,026	INFO worker.py:1819 -- Started a local Ray instance.
2025-01-02 00:01:06,545	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-01-02 00:01:06 (running for 00:00:00.32)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-01-06/TorchTrainer_2025-01-02_00-01-06/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-01-02 00:01:12 (running for 00:00:05.38)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-01-06/TorchTrainer_2025-01-02_00-01-06/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-01-02 00:01:17 (running for 00:00:10.46)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-01-06/TorchT

[36m(RayTrainWorker pid=8096)[0m Setting up process group for: env:// [rank=0, world_size=1]
2025-01-02 00:01:42,210	ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_c13cb_00000
Traceback (most recent call last):
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\_private\worker.py", line 2753, in get
    values, d

== Status ==
Current time: 2025-01-02 00:01:42 (running for 00:00:35.62)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-01-06/TorchTrainer_2025-01-02_00-01-06/driver_artifacts
Number of trials: 1/1 (1 ERROR)
Number of errored trials: 1
+--------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               |   # failures | error file                                                                                                                                                                                                             |
|--------------------------+--------------+---------------------------------------------------

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("C:/Users/white/ray_results/TorchTrainer_2025-01-02_00-01-06")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [4]:
pip install torch --upgrade

Note: you may need to restart the kernel to use updated packages.


In [9]:
!git add .



In [10]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   Untitled2.ipynb



In [13]:
!git config --global core.autocrlf true

In [14]:
!git add .

In [15]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   Untitled2.ipynb



In [18]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Untitled2.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
!git commit -m "Updated training with more memory usage"