Skip to content

Commit cfd56cd

Browse files
authored
Add ModelTrainer & ModelBuilder example notebooks (#4792)
* Add ModelTrainer & ModelBuilder example notebooks * Update recipes setup * Update Readme for ModelTrainer * Update Readme for ModelBuilder * Remove codeblock
1 parent 00d3763 commit cfd56cd

File tree

17 files changed

+1999
-0
lines changed

17 files changed

+1999
-0
lines changed

build_and_train_models/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
The example notebooks within this folder showcase the capabilities of Amazon SageMaker in building and training machine learning models.
66

7+
- [ModelTrainer - New and Improved Training Interface for the SageMaker PySDK](sm-model_trainer/model_trainer_overview.ipynb)
78
- [Visualize Training Jobs and Performance of Your Model Using TensorBoard on SageMaker](sm-distributed_data_parallelism_pytorch/sm-distributed_data_parallelism_pytorch.ipynb)
89
- [Use SageMaker Distributed Model Parallel with Amazon SageMaker to Launch Training Job with Model Parallelization](sm-distributed_model_parallel/sm-distributed_model_parallel.ipynb)
910
- [Time Series Modeling with Amazon Forecast and DeepAR on SageMaker - DeepAR on SageMaker](sm-forecast_deepar_time_series_modeling/sm-forecast_deepar_time_series_modeling.ipynb)
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# flake8: noqa
2+
import argparse
3+
import numpy as np
4+
import os
5+
import sys
6+
import logging
7+
import json
8+
import shutil
9+
import torch
10+
import torch.nn as nn
11+
from torch.utils.data import DataLoader, TensorDataset
12+
from pytorch_model_def import get_model
13+
14+
# Directory that contains this script; other paths are resolved relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Module logger: DEBUG level, with a stream handler attached to stdout.
_stdout_handler = logging.StreamHandler(sys.stdout)
logger = logging.getLogger(__name__)
logger.addHandler(_stdout_handler)
logger.setLevel(logging.DEBUG)
18+
19+
20+
def get_train_data(train_dir):
    """
    Load the training arrays from ``train_dir`` and return them as tensors.

    Reads ``x_train.npy`` and ``y_train.npy``, logs both array shapes, and
    returns an ``(x, y)`` tuple of tensors created with ``torch.from_numpy``.
    """
    x_array, y_array = (
        np.load(os.path.join(train_dir, file_name))
        for file_name in ("x_train.npy", "y_train.npy")
    )
    logger.info(f"x train: {x_array.shape}, y train: {y_array.shape}")

    return torch.from_numpy(x_array), torch.from_numpy(y_array)
30+
31+
32+
def get_test_data(test_dir):
    """
    Load the testing arrays from ``test_dir`` and return them as tensors.

    Reads ``x_test.npy`` and ``y_test.npy``, logs both array shapes, and
    returns an ``(x, y)`` tuple of tensors created with ``torch.from_numpy``.
    """
    x_path = os.path.join(test_dir, "x_test.npy")
    y_path = os.path.join(test_dir, "y_test.npy")

    x_array = np.load(x_path)
    y_array = np.load(y_path)
    logger.info(f"x test: {x_array.shape}, y test: {y_array.shape}")

    tensors = (torch.from_numpy(x_array), torch.from_numpy(y_array))
    return tensors
42+
43+
44+
def model_fn(model_dir):
    """
    Load the model for inference.

    Builds the network with ``get_model()``, restores the weights stored in
    ``model.pth`` inside ``model_dir``, and switches the model to eval mode.

    Args:
        model_dir: Directory that contains the ``model.pth`` state dict.

    Returns:
        The model in eval mode, placed on CUDA when available, otherwise CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()
    # map_location lets a checkpoint that was saved from a GPU be loaded on a
    # CPU-only host (and vice versa) instead of failing during deserialization.
    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    return model.to(device)
54+
55+
56+
def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input.

    Args:
        request_body: Serialized request payload; decoded with ``json.loads``.
        request_content_type: Content type of the payload. Only
            ``"application/json"`` is supported.

    Returns:
        A ``torch.Tensor`` built from the decoded JSON payload.

    Raises:
        ValueError: If ``request_content_type`` is not ``"application/json"``.
    """
    if request_content_type != "application/json":
        # Fail loudly here: implicitly returning None would only surface later
        # as an unrelated attribute error when the prediction step runs.
        raise ValueError(
            "Unsupported content type: {!r}; expected 'application/json'".format(
                request_content_type
            )
        )

    request = json.loads(request_body)
    return torch.tensor(request)
65+
66+
67+
def predict_fn(input_data, model):
    """
    Apply model to the incoming request.

    Args:
        input_data: Tensor holding the request inputs; it is cast to float
            before being passed to the model.
        model: The ``torch.nn.Module`` to evaluate.

    Returns:
        The first row of the model output as a NumPy array.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        # The input has to live on the same device as the model's parameters.
        output = model(input_data.float().to(device))
    # Tensor.numpy() only works for CPU tensors, so copy back before converting.
    return output.cpu().numpy()[0]
77+
78+
79+
def train():
    """
    Train the PyTorch model.

    Loads the train/test arrays from ``data/train`` and ``data/test`` next to
    this script, trains the network from ``get_model()`` with SGD and an MSE
    loss, prints the test MSE, then writes ``model.pth`` to the model
    directory (``SM_MODEL_DIR`` or ``data/model``) and copies
    ``custom_script.py`` and ``pytorch_model_def.py`` into its ``code/``
    sub-folder.
    """
    # Resolve the device here rather than relying on a module-level global, so
    # train() also works when this module is imported instead of run as a script.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Directories: train, test and model
    train_dir = os.path.join(current_dir, "data/train")
    test_dir = os.path.join(current_dir, "data/test")
    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))

    # Load the training and testing data
    x_train, y_train = get_train_data(train_dir)
    x_test, y_test = get_test_data(test_dir)
    train_ds = TensorDataset(x_train, y_train)

    # Training parameters - used to configure the training loop
    batch_size = 64
    epochs = 1
    learning_rate = 0.1
    logger.info(
        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
    )

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)

    # Define the model, loss function and optimizer
    model = get_model()
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Train the model (epochs are reported 1-based)
    for epoch in range(1, epochs + 1):
        loss = None  # stays None only when the data loader yields no batches
        for x_train_batch, y_train_batch in train_dl:
            # Batches must be on the same device as the model parameters.
            x_train_batch = x_train_batch.float().to(device)
            y_train_batch = y_train_batch.float().to(device)
            y = model(x_train_batch)
            loss = criterion(y.flatten(), y_train_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logger.info(f"epoch: {epoch} -> loss: {loss}")

    # Test the model
    with torch.no_grad():
        y = model(x_test.float().to(device)).flatten()
        mse = ((y - y_test.to(device)) ** 2).sum() / y_test.shape[0]
        # Tensor.numpy() requires a CPU tensor.
        print("\nTest MSE:", mse.cpu().numpy())

    # Save the model; weights are copied to CPU first so the checkpoint can be
    # loaded on hosts without a GPU.
    os.makedirs(model_dir, exist_ok=True)
    cpu_state = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
    torch.save(cpu_state, os.path.join(model_dir, "model.pth"))
    inference_code_path = model_dir + "/code/"

    if not os.path.exists(inference_code_path):
        os.makedirs(inference_code_path, exist_ok=True)
        logger.info("Created a folder at {}!".format(inference_code_path))

    # Ship the inference code alongside the weights.
    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
    logger.info("Saving models files to {}".format(inference_code_path))
139+
140+
141+
# Script entry point: announce the run, pick the compute device, start training.
if __name__ == "__main__":
    print("Running the training job ...\n")

    # Module-level device selection: CUDA when available, otherwise CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    train()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# flake8: noqa
2+
import argparse
3+
import numpy as np
4+
import os
5+
import sys
6+
import logging
7+
import json
8+
import shutil
9+
import torch
10+
import torch.nn as nn
11+
from torch.utils.data import DataLoader, TensorDataset
12+
from pytorch_model_def import get_model
13+
14+
15+
# Directory that contains this script, and the root folder of the input data.
current_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = "/opt/ml/input/data"

# Module logger: DEBUG level, with a stream handler attached to stdout.
_stdout_handler = logging.StreamHandler(sys.stdout)
logger = logging.getLogger(__name__)
logger.addHandler(_stdout_handler)
logger.setLevel(logging.DEBUG)
20+
21+
22+
def get_train_data(train_dir):
    """
    Load the training arrays from ``train_dir`` and return them as tensors.

    Reads ``x_train.npy`` and ``y_train.npy``, logs both array shapes, and
    returns an ``(x, y)`` tuple of tensors created with ``torch.from_numpy``.
    """
    x_array, y_array = (
        np.load(os.path.join(train_dir, file_name))
        for file_name in ("x_train.npy", "y_train.npy")
    )
    logger.info(f"x train: {x_array.shape}, y train: {y_array.shape}")

    return torch.from_numpy(x_array), torch.from_numpy(y_array)
32+
33+
34+
def get_test_data(test_dir):
    """
    Load the testing arrays from ``test_dir`` and return them as tensors.

    Reads ``x_test.npy`` and ``y_test.npy``, logs both array shapes, and
    returns an ``(x, y)`` tuple of tensors created with ``torch.from_numpy``.
    """
    x_path = os.path.join(test_dir, "x_test.npy")
    y_path = os.path.join(test_dir, "y_test.npy")

    x_array = np.load(x_path)
    y_array = np.load(y_path)
    logger.info(f"x test: {x_array.shape}, y test: {y_array.shape}")

    tensors = (torch.from_numpy(x_array), torch.from_numpy(y_array))
    return tensors
44+
45+
46+
def model_fn(model_dir):
    """
    Load the model for inference.

    Builds the network with ``get_model()``, restores the weights stored in
    ``model.pth`` inside ``model_dir``, and switches the model to eval mode.

    Args:
        model_dir: Directory that contains the ``model.pth`` state dict.

    Returns:
        The model in eval mode, placed on CUDA when available, otherwise CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()
    # map_location lets a checkpoint that was saved from a GPU be loaded on a
    # CPU-only host (and vice versa) instead of failing during deserialization.
    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    return model.to(device)
56+
57+
58+
def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input.

    Args:
        request_body: Serialized request payload; decoded with ``json.loads``.
        request_content_type: Content type of the payload. Only
            ``"application/json"`` is supported.

    Returns:
        A ``torch.Tensor`` built from the decoded JSON payload.

    Raises:
        ValueError: If ``request_content_type`` is not ``"application/json"``.
    """
    if request_content_type != "application/json":
        # Fail loudly here: implicitly returning None would only surface later
        # as an unrelated attribute error when the prediction step runs.
        raise ValueError(
            "Unsupported content type: {!r}; expected 'application/json'".format(
                request_content_type
            )
        )

    request = json.loads(request_body)
    return torch.tensor(request)
67+
68+
69+
def predict_fn(input_data, model):
    """
    Apply model to the incoming request.

    Args:
        input_data: Tensor holding the request inputs; it is cast to float
            before being passed to the model.
        model: The ``torch.nn.Module`` to evaluate.

    Returns:
        The first row of the model output as a NumPy array.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        # The input has to live on the same device as the model's parameters.
        output = model(input_data.float().to(device))
    # Tensor.numpy() only works for CPU tensors, so copy back before converting.
    return output.cpu().numpy()[0]
79+
80+
81+
def train():
    """
    Train the PyTorch model.

    Loads the train/test arrays from the ``train`` and ``test`` folders under
    ``data_dir``, trains the network from ``get_model()`` with SGD and an MSE
    loss, prints the test MSE, then writes ``model.pth`` to the model
    directory (``SM_MODEL_DIR`` or ``data/model`` next to this script) and
    copies ``custom_script.py`` and ``pytorch_model_def.py`` into its
    ``code/`` sub-folder.
    """
    # Resolve the device here rather than relying on a module-level global, so
    # train() also works when this module is imported instead of run as a script.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Directories: train, test and model
    train_dir = os.path.join(data_dir, "train")
    test_dir = os.path.join(data_dir, "test")
    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))

    # Load the training and testing data
    x_train, y_train = get_train_data(train_dir)
    x_test, y_test = get_test_data(test_dir)
    train_ds = TensorDataset(x_train, y_train)

    # Training parameters - used to configure the training loop
    batch_size = 64
    epochs = 1
    learning_rate = 0.1
    logger.info(
        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
    )

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)

    # Define the model, loss function and optimizer
    model = get_model()
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Train the model (epochs are reported 1-based)
    for epoch in range(1, epochs + 1):
        loss = None  # stays None only when the data loader yields no batches
        for x_train_batch, y_train_batch in train_dl:
            # Batches must be on the same device as the model parameters.
            x_train_batch = x_train_batch.float().to(device)
            y_train_batch = y_train_batch.float().to(device)
            y = model(x_train_batch)
            loss = criterion(y.flatten(), y_train_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logger.info(f"epoch: {epoch} -> loss: {loss}")

    # Test the model
    with torch.no_grad():
        y = model(x_test.float().to(device)).flatten()
        mse = ((y - y_test.to(device)) ** 2).sum() / y_test.shape[0]
        # Tensor.numpy() requires a CPU tensor.
        print("\nTest MSE:", mse.cpu().numpy())

    # Save the model; weights are copied to CPU first so the checkpoint can be
    # loaded on hosts without a GPU.
    os.makedirs(model_dir, exist_ok=True)
    cpu_state = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
    torch.save(cpu_state, os.path.join(model_dir, "model.pth"))
    inference_code_path = model_dir + "/code/"

    if not os.path.exists(inference_code_path):
        os.makedirs(inference_code_path, exist_ok=True)
        logger.info("Created a folder at {}!".format(inference_code_path))

    # Ship the inference code alongside the weights.
    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
    logger.info("Saving models files to {}".format(inference_code_path))
141+
142+
143+
# Script entry point: announce the run, pick the compute device, start training.
if __name__ == "__main__":
    print("Running the training job ...\n")

    # Module-level device selection: CUDA when available, otherwise CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    train()
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# flake8: noqa
2+
import torch
3+
import torch.nn as nn
4+
5+
6+
class NeuralNet(nn.Module):
    """Fully connected network with three linear layers (8 -> 8 -> 6 -> 1)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=8, out_features=8)
        self.fc2 = nn.Linear(in_features=8, out_features=6)
        self.fc3 = nn.Linear(in_features=6, out_features=1)

    def forward(self, x):
        """Apply fc1 + tanh, then fc2 + sigmoid, then fc3 with no activation."""
        hidden = torch.tanh(self.fc1(x))
        squashed = torch.sigmoid(self.fc2(hidden))
        return self.fc3(squashed)
18+
19+
20+
def get_model():
    """Return a newly constructed ``NeuralNet`` instance."""
    return NeuralNet()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
numpy
2+
-f https://download.pytorch.org/whl/torch_stable.html
3+
torch==2.0.1+cpu
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# flake8: noqa
2+
from transformers import AutoModelForCausalLM, AutoTokenizer
3+
from peft import PeftModel
4+
import torch
5+
6+
import os
7+
import argparse
8+
9+
10+
def get_args(argv=None):
    """
    Parse the command-line options of the merge script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``base_model_name_or_path``,
        ``peft_model_path`` and ``push_to_hub``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name_or_path", type=str, default="bigcode/starcoderbase-7b")
    parser.add_argument("--peft_model_path", type=str, default="/")
    # push_to_hub defaults to True, so a store_true flag alone can never turn it
    # off; --no_push_to_hub is the opt-out that makes saving locally reachable.
    parser.add_argument("--push_to_hub", dest="push_to_hub", action="store_true", default=True)
    parser.add_argument("--no_push_to_hub", dest="push_to_hub", action="store_false")

    return parser.parse_args(argv)
17+
18+
19+
def main():
    """
    Merge a PEFT adapter into its base model and publish or save the result.

    Loads the base causal-LM in float16, applies the adapter found at
    ``peft_model_path``, merges and unloads it, and loads the base tokenizer.
    The merged model and tokenizer are then pushed to the hub as a private
    repo when ``push_to_hub`` is set, otherwise saved locally. In both cases
    the target name is ``<base_model_name_or_path>-merged``.
    """
    args = get_args()
    base_name = args.base_model_name_or_path
    merged_name = f"{base_name}-merged"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_name, return_dict=True, torch_dtype=torch.float16
    )
    model = PeftModel.from_pretrained(base_model, args.peft_model_path).merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained(base_name)

    if not args.push_to_hub:
        model.save_pretrained(merged_name)
        tokenizer.save_pretrained(merged_name)
        print(f"Model saved to {merged_name}")
        return

    print("Saving to hub ...")
    # Model first, then tokenizer, both with the same upload options.
    for artifact in (model, tokenizer):
        artifact.push_to_hub(merged_name, use_temp_dir=False, private=True)
43+
44+
45+
# Run the merge only when this file is executed as a script.
if __name__ == "__main__":
    main()
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
transformers==4.31
2+
datasets==2.16.1
3+
fsspec==2023.6.0
4+
accelerate>=0.21
5+
tokenizers>=0.13.3
6+
bitsandbytes
7+
peft

0 commit comments

Comments
 (0)