In [25]:
import time
import ray
import psutil
import torch
from ray.train.torch import TorchTrainer
from ray.air import ScalingConfig

# Initialize Ray
ray.init(ignore_reinit_error=True)

# Define the training function
@ray.remote
def train_fn(config):
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset

    # Simulate data size
    data_size = config["data_size"]
    batch_size = config.get("batch_size", 64)

    # Generate synthetic data (X: features, y: labels)
    X = torch.randn(data_size, 10)  # 10 features
    y = torch.randint(0, 2, (data_size,))

    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = nn.Sequential(
        nn.Linear(10, 16),
        nn.ReLU(),
        nn.Linear(16, 2)  # Output layer for 2 classes
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Training loop (5 epochs for better evaluation)
    epochs = 5
    for epoch in range(epochs):
        total_loss = 0.0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

    return {"final_loss": total_loss}

# Dataset size for testing scalability (starting with smaller size)
datasets = [10_000, 50_000]  # Simulate larger dataset sizes for better scaling evaluation

# Varying number of workers (GPUs/Nodes)
num_workers_list = [1, 2, 4, 8]  # Number of workers to simulate scaling

# Loop through datasets and train with different numbers of workers
for size in datasets:
    for num_workers in num_workers_list:
        start_time = time.time()
        start_memory = psutil.Process().memory_info().rss  # Track memory before training

        # Configure the trainer (Using varying number of workers)
        trainer = TorchTrainer(
            train_loop_per_worker=train_fn,
            scaling_config=ScalingConfig(num_workers=num_workers),
            train_loop_config={"data_size": size}
        )

        # Train and get results
        results = trainer.fit()

        end_time = time.time()
        end_memory = psutil.Process().memory_info().rss  # Track memory after training

        training_time = end_time - start_time
        memory_usage = (end_memory - start_memory) / (1024 ** 2)  # Convert to MB

        print(f"Dataset size: {size}, Workers: {num_workers}, Training time: {training_time:.2f} seconds, Memory usage: {memory_usage:.2f} MB")

        # Optional: Measure GPU utilization
        if torch.cuda.is_available():
            gpu_usage = torch.cuda.memory_allocated() / (1024 ** 2)  # MB
            print(f"GPU memory used: {gpu_usage:.2f} MB")

# Shut down Ray
ray.shutdown()


2025-01-02 00:34:29,224	INFO worker.py:1652 -- Calling ray.init() again after it has already been called.
2025-01-02 00:34:29,445	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-01-02 00:34:29 (running for 00:00:00.16)
Using FIFO scheduling algorithm.
Logical resource usage: 0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-34-29/TorchTrainer_2025-01-02_00-34-29/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-01-02 00:34:34 (running for 00:00:05.21)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-34-29/TorchTrainer_2025-01-02_00-34-29/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-01-02 00:34:39 (running for 00:00:10.29)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-34-29/TorchTra

2025-01-02 00:34:53,471	ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_6b0ec_00000
Traceback (most recent call last):
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\white\AppData\Local\Programs\Python\Python312\Lib\site-packages\ray\_private\worker.py", line 2753, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                         

== Status ==
Current time: 2025-01-02 00:34:53 (running for 00:00:24.06)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/4 CPUs, 0/0 GPUs
Result logdir: C:/Users/white/AppData/Local/Temp/ray/session_2025-01-02_00-00-56_174621_6956/artifacts/2025-01-02_00-34-29/TorchTrainer_2025-01-02_00-34-29/driver_artifacts
Number of trials: 1/1 (1 ERROR)
Number of errored trials: 1
+--------------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               |   # failures | error file                                                                                                                                                                                                             |
|--------------------------+--------------+---------------------------------------------------

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("C:/Users/white/ray_results/TorchTrainer_2025-01-02_00-34-29")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [4]:
pip install torch --upgrade

Note: you may need to restart the kernel to use updated packages.


In [9]:
!git add .



In [10]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   Untitled2.ipynb



In [20]:
!git config --global core.autocrlf true

In [21]:
!git add .

In [22]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   Untitled2.ipynb



In [19]:
!git add .



In [23]:
!git commit -m "Updated training with more memory usage"

[main ce4c94b] Updated training with more memory usage
 1 file changed, 264 insertions(+), 182 deletions(-)


In [24]:
!git push origin main


To https://github.com/whitemr7/Ray.git
   eec3054..ce4c94b  main -> main
