aws
diff --git a/‎ build_and_train_models/README.md
Lines changed: 1 addition & 0 deletions b/‎ build_and_train_models/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/custom_script.py
Lines changed: 146 additions & 0 deletions b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/custom_script.py
Lines changed: 146 additions & 0 deletions
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/test/x_test.npy
19 KB b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/test/x_test.npy
19 KB
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/test/y_test.npy
2.48 KB b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/test/y_test.npy
2.48 KB
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/train/x_train.npy
75.4 KB b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/train/x_train.npy
75.4 KB
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/train/y_train.npy
9.53 KB b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/data/train/y_train.npy
9.53 KB
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/local_training_script.py
Lines changed: 148 additions & 0 deletions b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/local_training_script.py
Lines changed: 148 additions & 0 deletions
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/pytorch_model_def.py
Lines changed: 23 additions & 0 deletions b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/pytorch_model_def.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎ build_and_train_models/sm-model_trainer/basic-script-mode/requirements.txt
Lines changed: 3 additions & 0 deletions b/‎ build_and_train_models/sm-model_trainer/basic-script-mode/requirements.txt
Lines changed: 3 additions & 0 deletions
diff --git a/‎ build_and_train_models/sm-model_trainer/distributed-training/scripts/merge_peft_adapters.py
Lines changed: 46 additions & 0 deletions b/‎ build_and_train_models/sm-model_trainer/distributed-training/scripts/merge_peft_adapters.py
Lines changed: 46 additions & 0 deletions
diff --git a/‎ build_and_train_models/sm-model_trainer/distributed-training/scripts/requirements.txt
Lines changed: 7 additions & 0 deletions b/‎ build_and_train_models/sm-model_trainer/distributed-training/scripts/requirements.txt
Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 The example notebooks within this folder showcase the capabilities of Amazon SageMaker in building and training machine learning models.
 
+- [ModelTrainer - New and Improved Training Interface for the SageMaker PySDK](sm-model_trainer/model_trainer_overview.ipynb)
 - [Visualize Training Jobs and Performance of Your Model Using TensorBoard on SageMaker](sm-distributed_data_parallelism_pytorch/sm-distributed_data_parallelism_pytorch.ipynb)
 - [Use SageMaker Distributed Model Parallel with Amazon SageMaker to Launch Training Job with Model Parallelization](sm-distributed_model_parallel/sm-distributed_model_parallel.ipynb)
 - [Time Series Modeling with Amazon Forecast and DeepAR on SageMaker - DeepAR on SageMaker](sm-forecast_deepar_time_series_modeling/sm-forecast_deepar_time_series_modeling.ipynb)
 
@@ -0,0 +1,146 @@
+# flake8: noqa
+import argparse
+import numpy as np
+import os
+import sys
+import logging
+import json
+import shutil
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+from pytorch_model_def import get_model
+
+# Module-level logger: DEBUG threshold, with records echoed to stdout.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+# Directory containing this script; train() uses it as the base for the
+# data/train, data/test, default model and default code locations.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+def get_train_data(train_dir):
+    """
+    Load the training arrays from ``train_dir`` and return them as tensors.
+
+    Reads ``x_train.npy`` and ``y_train.npy``, logs both shapes, and returns
+    the pair ``(x, y)`` converted with ``torch.from_numpy``.
+    """
+
+    features, targets = (
+        np.load(os.path.join(train_dir, file_name))
+        for file_name in ("x_train.npy", "y_train.npy")
+    )
+    logger.info(f"x train: {features.shape}, y train: {targets.shape}")
+
+    return tuple(torch.from_numpy(array) for array in (features, targets))
+
+
+def get_test_data(test_dir):
+    """
+    Load the testing arrays from ``test_dir`` and return them as tensors.
+
+    Reads ``x_test.npy`` and ``y_test.npy``, logs both shapes, and returns
+    the pair ``(x, y)`` converted with ``torch.from_numpy``.
+    """
+
+    features, targets = (
+        np.load(os.path.join(test_dir, file_name))
+        for file_name in ("x_test.npy", "y_test.npy")
+    )
+    logger.info(f"x test: {features.shape}, y test: {targets.shape}")
+
+    return tuple(torch.from_numpy(array) for array in (features, targets))
+
+
+def model_fn(model_dir):
+    """
+    Load the model for inference
+
+    Args:
+        model_dir: directory that contains the ``model.pth`` state dict.
+
+    Returns:
+        The model built by ``get_model()`` with the saved weights loaded,
+        switched to eval mode and placed on CUDA when available, else CPU.
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = get_model()
+    # map_location remaps the stored tensors onto the local device, so a
+    # checkpoint written on a GPU host still loads on a CPU-only host.
+    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    return model.to(device)
+
+
+def input_fn(request_body, request_content_type):
+    """
+    Deserialize and prepare the prediction input
+
+    Args:
+        request_body: serialized request payload.
+        request_content_type: MIME type of the payload; only
+            ``application/json`` is supported.
+
+    Returns:
+        A tensor built from the decoded JSON payload.
+
+    Raises:
+        ValueError: if the content type is not ``application/json``.
+    """
+
+    # Fail loudly instead of returning None, which would only surface later
+    # as an unrelated attribute error inside predict_fn.
+    if request_content_type != "application/json":
+        raise ValueError(f"Unsupported content type: {request_content_type}")
+
+    request = json.loads(request_body)
+    return torch.tensor(request)
+
+
+def predict_fn(input_data, model):
+    """
+    Apply model to the incoming request
+
+    Args:
+        input_data: tensor of model inputs (cast to float before the forward pass).
+        model: the module to run; it is moved to the selected device.
+
+    Returns:
+        The first row of the model output as a NumPy array.
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+    with torch.no_grad():
+        # The input must sit on the same device as the model's parameters.
+        output = model(input_data.float().to(device))
+    # .numpy() only works on CPU tensors, so bring the result back first.
+    return output.cpu().numpy()[0]
+
+
+def train():
+    """
+    Train the PyTorch model
+
+    Loads the train/test arrays, fits the model returned by ``get_model()``
+    with SGD on an MSE loss, prints the test MSE, saves the state dict to
+    ``<model_dir>/model.pth`` and copies the inference code into
+    ``<model_dir>/code/``.
+    """
+    # Resolve the device locally: the module-level ``device`` is only assigned
+    # when the file runs as a script, so relying on it breaks train() on import.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Directories: train, test and model
+    train_dir = os.path.join(current_dir, "data/train")
+    test_dir = os.path.join(current_dir, "data/test")
+    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))
+
+    # Load the training and testing data
+    x_train, y_train = get_train_data(train_dir)
+    x_test, y_test = get_test_data(test_dir)
+    train_ds = TensorDataset(x_train, y_train)
+
+    # Training parameters - used to configure the training loop
+    batch_size = 64
+    epochs = 1
+    learning_rate = 0.1
+    logger.info(
+        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
+    )
+
+    train_dl = DataLoader(train_ds, batch_size, shuffle=True)
+
+    # Define the model, loss function and optimizer
+    model = get_model()
+    model = model.to(device)
+    criterion = nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model (epochs are reported 1-based)
+    for epoch in range(1, epochs + 1):
+        for x_train_batch, y_train_batch in train_dl:
+            # Batches must live on the same device as the model parameters
+            x_train_batch = x_train_batch.float().to(device)
+            y_train_batch = y_train_batch.float().to(device)
+            y = model(x_train_batch)
+            loss = criterion(y.flatten(), y_train_batch)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        # Reports the loss of the last batch of the epoch
+        logger.info(f"epoch: {epoch} -> loss: {loss}")
+
+    # Test the model
+    with torch.no_grad():
+        y_test = y_test.to(device)
+        y = model(x_test.float().to(device)).flatten()
+        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
+    # .numpy() requires a CPU tensor
+    print("\nTest MSE:", mse.cpu().numpy())
+
+    # Save the model; weights are stored on CPU so the checkpoint also loads
+    # on hosts without a GPU
+    os.makedirs(model_dir, exist_ok=True)
+    cpu_state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
+    torch.save(cpu_state_dict, os.path.join(model_dir, "model.pth"))
+    inference_code_path = model_dir + "/code/"
+
+    if not os.path.exists(inference_code_path):
+        os.mkdir(inference_code_path)
+        logger.info("Created a folder at {}!".format(inference_code_path))
+
+    # Ship the inference entry point and model definition next to the weights
+    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
+    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
+    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
+    logger.info("Saving models files to {}".format(inference_code_path))
+
+
+# Script entry point: announce the run, pick the compute device, then train.
+if __name__ == "__main__":
+    print("Running the training job ...\n")
+
+    # Module-level device: CUDA when available, otherwise CPU.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    train()
@@ -0,0 +1,148 @@
+# flake8: noqa
+import argparse
+import numpy as np
+import os
+import sys
+import logging
+import json
+import shutil
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+from pytorch_model_def import get_model
+
+
+# Module-level logger: DEBUG threshold, with records echoed to stdout.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+# Directory containing this script; train() uses it for the default model
+# and default code locations.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+# Parent of the "train" and "test" input directories read by train().
+# NOTE(review): presumably the SageMaker input-channel mount point - confirm.
+data_dir = "/opt/ml/input/data"
+
+
+def get_train_data(train_dir):
+    """
+    Load the training arrays from ``train_dir`` and return them as tensors.
+
+    Reads ``x_train.npy`` and ``y_train.npy``, logs both shapes, and returns
+    the pair ``(x, y)`` converted with ``torch.from_numpy``.
+    """
+
+    features, targets = (
+        np.load(os.path.join(train_dir, file_name))
+        for file_name in ("x_train.npy", "y_train.npy")
+    )
+    logger.info(f"x train: {features.shape}, y train: {targets.shape}")
+
+    return tuple(torch.from_numpy(array) for array in (features, targets))
+
+
+def get_test_data(test_dir):
+    """
+    Load the testing arrays from ``test_dir`` and return them as tensors.
+
+    Reads ``x_test.npy`` and ``y_test.npy``, logs both shapes, and returns
+    the pair ``(x, y)`` converted with ``torch.from_numpy``.
+    """
+
+    features, targets = (
+        np.load(os.path.join(test_dir, file_name))
+        for file_name in ("x_test.npy", "y_test.npy")
+    )
+    logger.info(f"x test: {features.shape}, y test: {targets.shape}")
+
+    return tuple(torch.from_numpy(array) for array in (features, targets))
+
+
+def model_fn(model_dir):
+    """
+    Load the model for inference
+
+    Args:
+        model_dir: directory that contains the ``model.pth`` state dict.
+
+    Returns:
+        The model built by ``get_model()`` with the saved weights loaded,
+        switched to eval mode and placed on CUDA when available, else CPU.
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = get_model()
+    # map_location remaps the stored tensors onto the local device, so a
+    # checkpoint written on a GPU host still loads on a CPU-only host.
+    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    return model.to(device)
+
+
+def input_fn(request_body, request_content_type):
+    """
+    Deserialize and prepare the prediction input
+
+    Args:
+        request_body: serialized request payload.
+        request_content_type: MIME type of the payload; only
+            ``application/json`` is supported.
+
+    Returns:
+        A tensor built from the decoded JSON payload.
+
+    Raises:
+        ValueError: if the content type is not ``application/json``.
+    """
+
+    # Fail loudly instead of returning None, which would only surface later
+    # as an unrelated attribute error inside predict_fn.
+    if request_content_type != "application/json":
+        raise ValueError(f"Unsupported content type: {request_content_type}")
+
+    request = json.loads(request_body)
+    return torch.tensor(request)
+
+
+def predict_fn(input_data, model):
+    """
+    Apply model to the incoming request
+
+    Args:
+        input_data: tensor of model inputs (cast to float before the forward pass).
+        model: the module to run; it is moved to the selected device.
+
+    Returns:
+        The first row of the model output as a NumPy array.
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+    with torch.no_grad():
+        # The input must sit on the same device as the model's parameters.
+        output = model(input_data.float().to(device))
+    # .numpy() only works on CPU tensors, so bring the result back first.
+    return output.cpu().numpy()[0]
+
+
+def train():
+    """
+    Train the PyTorch model
+
+    Loads the train/test arrays from ``data_dir``, fits the model returned by
+    ``get_model()`` with SGD on an MSE loss, prints the test MSE, saves the
+    state dict to ``<model_dir>/model.pth`` and copies the inference code into
+    ``<model_dir>/code/``.
+    """
+    # Resolve the device locally: the module-level ``device`` is only assigned
+    # when the file runs as a script, so relying on it breaks train() on import.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Directories: train, test and model
+    train_dir = os.path.join(data_dir, "train")
+    test_dir = os.path.join(data_dir, "test")
+    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))
+
+    # Load the training and testing data
+    x_train, y_train = get_train_data(train_dir)
+    x_test, y_test = get_test_data(test_dir)
+    train_ds = TensorDataset(x_train, y_train)
+
+    # Training parameters - used to configure the training loop
+    batch_size = 64
+    epochs = 1
+    learning_rate = 0.1
+    logger.info(
+        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
+    )
+
+    train_dl = DataLoader(train_ds, batch_size, shuffle=True)
+
+    # Define the model, loss function and optimizer
+    model = get_model()
+    model = model.to(device)
+    criterion = nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Train the model (epochs are reported 1-based)
+    for epoch in range(1, epochs + 1):
+        for x_train_batch, y_train_batch in train_dl:
+            # Batches must live on the same device as the model parameters
+            x_train_batch = x_train_batch.float().to(device)
+            y_train_batch = y_train_batch.float().to(device)
+            y = model(x_train_batch)
+            loss = criterion(y.flatten(), y_train_batch)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        # Reports the loss of the last batch of the epoch
+        logger.info(f"epoch: {epoch} -> loss: {loss}")
+
+    # Test the model
+    with torch.no_grad():
+        y_test = y_test.to(device)
+        y = model(x_test.float().to(device)).flatten()
+        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
+    # .numpy() requires a CPU tensor
+    print("\nTest MSE:", mse.cpu().numpy())
+
+    # Save the model; weights are stored on CPU so the checkpoint also loads
+    # on hosts without a GPU
+    os.makedirs(model_dir, exist_ok=True)
+    cpu_state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
+    torch.save(cpu_state_dict, os.path.join(model_dir, "model.pth"))
+    inference_code_path = model_dir + "/code/"
+
+    if not os.path.exists(inference_code_path):
+        os.mkdir(inference_code_path)
+        logger.info("Created a folder at {}!".format(inference_code_path))
+
+    # Ship the inference entry point and model definition next to the weights
+    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
+    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
+    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
+    logger.info("Saving models files to {}".format(inference_code_path))
+
+
+# Script entry point: announce the run, pick the compute device, then train.
+if __name__ == "__main__":
+    print("Running the training job ...\n")
+
+    # Module-level device: CUDA when available, otherwise CPU.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    train()
@@ -0,0 +1,23 @@
+# flake8: noqa
+import torch
+import torch.nn as nn
+
+
+class NeuralNet(nn.Module):
+    """Three-layer fully connected network mapping 8 inputs to 1 output.
+
+    Layer widths are 8 -> 8 -> 6 -> 1, with tanh after the first layer,
+    sigmoid after the second, and no activation on the output.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.fc1 = nn.Linear(in_features=8, out_features=8)
+        self.fc2 = nn.Linear(in_features=8, out_features=6)
+        self.fc3 = nn.Linear(in_features=6, out_features=1)
+
+    def forward(self, x):
+        """Run the forward pass and return the raw output of the last layer."""
+        hidden = torch.tanh(self.fc1(x))
+        hidden = torch.sigmoid(self.fc2(hidden))
+        return self.fc3(hidden)
+
+
+def get_model():
+    """Build and return a freshly initialised ``NeuralNet``."""
+    return NeuralNet()
@@ -0,0 +1,3 @@
+numpy
+-f https://download.pytorch.org/whl/torch_stable.html
+torch==2.0.1+cpu
@@ -0,0 +1,46 @@
+# flake8: noqa
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+import torch
+
+import os
+import argparse
+
+
+def get_args():
+    """Parse the command-line options for merging a PEFT adapter.
+
+    Options:
+        --base_model_name_or_path: base model to load (default
+            ``bigcode/starcoderbase-7b``).
+        --peft_model_path: location of the PEFT adapter (default ``/``).
+        --push_to_hub / --no_push_to_hub: push the merged model to the hub
+            (default) or save it locally instead.
+
+    Returns:
+        The parsed ``argparse.Namespace``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_model_name_or_path", type=str, default="bigcode/starcoderbase-7b")
+    parser.add_argument("--peft_model_path", type=str, default="/")
+    parser.add_argument("--push_to_hub", action="store_true", default=True)
+    # With store_true and default=True the flag could never be switched off,
+    # which made the local-save path unreachable; this provides the opt-out.
+    parser.add_argument("--no_push_to_hub", dest="push_to_hub", action="store_false")
+
+    return parser.parse_args()
+
+
+def main():
+    """Merge a PEFT adapter into its base model, then publish or save it.
+
+    Loads the base model in float16, applies the adapter found at
+    ``--peft_model_path``, calls ``merge_and_unload()`` on it, and either
+    pushes the result and the tokenizer to the hub as a private repo or saves
+    both locally, under the name ``<base_model_name_or_path>-merged``.
+    """
+    args = get_args()
+    base_name = args.base_model_name_or_path
+    merged_name = f"{base_name}-merged"
+
+    merged_model = PeftModel.from_pretrained(
+        AutoModelForCausalLM.from_pretrained(
+            base_name, return_dict=True, torch_dtype=torch.float16
+        ),
+        args.peft_model_path,
+    ).merge_and_unload()
+
+    tokenizer = AutoTokenizer.from_pretrained(base_name)
+
+    # Local save path: handled first so the hub branch reads straight through.
+    if not args.push_to_hub:
+        merged_model.save_pretrained(merged_name)
+        tokenizer.save_pretrained(merged_name)
+        print(f"Model saved to {merged_name}")
+        return
+
+    print("Saving to hub ...")
+    # Model first, then tokenizer, both to the same private repository.
+    for artifact in (merged_model, tokenizer):
+        artifact.push_to_hub(merged_name, use_temp_dir=False, private=True)
+
+
+# Run the merge only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,7 @@
+transformers==4.31
+datasets==2.16.1
+fsspec==2023.6.0
+accelerate>=0.21
+tokenizers>=0.13.3
+bitsandbytes
+peft
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+numpy`
	`2`	`+-f https://download.pytorch.org/whl/torch_stable.html`
	`3`	`+torch==2.0.1+cpu`