In [145]:
!pip install sagemaker pandas scikit-learn



In [146]:
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
import os

In [147]:
# Create a SageMaker SDK session object for this notebook.
sagemaker_session = sagemaker.Session()
# Execution role reported by the SDK; it is handed to the PyTorch estimator via `role=`.
role = sagemaker.get_execution_role()

In [148]:
%%writefile sagemaker_ml_script.py

import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import json
import logging

# Configure the root logger at INFO so the logger.info() calls in this script are not filtered out.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the data-loading, training and inference functions in this file.
logger = logging.getLogger(__name__)

def load_data(data_dir, battery_id='B0005', min_temperature=36):
    """Load ``discharge.csv`` and return the feature matrix and capacity targets.

    Args:
        data_dir: Directory that contains ``discharge.csv``.
        battery_id: Value of the ``Battery`` column to keep (default ``'B0005'``).
        min_temperature: Rows are kept only when ``Temperature_measured`` is
            strictly greater than this value (default ``36``).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has one row per retained sample
        and one column per entry of the feature list below; ``y`` holds the
        matching ``Capacity`` values.

    Raises:
        KeyError: If the CSV is missing any column this function reads.
        ValueError: If no rows remain after filtering.
    """
    features = ['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']
    data_path = os.path.join(data_dir, 'discharge.csv')
    logger.info(f"Loading data from {data_path}")
    df = pd.read_csv(data_path)

    # Fail with one readable message instead of a bare KeyError on the first missing column.
    missing = [col for col in ['Battery', 'Capacity'] + features if col not in df.columns]
    if missing:
        raise KeyError(f"{data_path} is missing required columns: {missing}")

    # Apply both row filters at once; the raw frame is left untouched.
    keep = (df['Battery'] == battery_id) & (df['Temperature_measured'] > min_temperature)
    selected = df[keep]
    logger.info(f"Loaded data shape: {selected.shape}")
    if selected.empty:
        # Surface the problem here rather than as an obscure failure further down the pipeline.
        raise ValueError(
            f"No rows left in {data_path} for Battery == {battery_id!r} "
            f"and Temperature_measured > {min_temperature}"
        )

    X = selected[features].values
    y = selected['Capacity'].values
    return X, y

def create_sequences(X, y, time_steps=10):
    """Cut the series into overlapping windows paired with next-step targets.

    Window ``k`` is ``X[k:k + time_steps]`` and its target is
    ``y[k + time_steps]``. ``len(X) - time_steps`` windows are produced, so
    nothing is produced when the series is not longer than ``time_steps``.

    Returns:
        Tuple ``(windows, targets)`` as NumPy arrays.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X[start:start + time_steps] for start in window_starts]
    targets = [y[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(targets)

class BatteryModel(nn.Module):
    """LSTM regressor: a batch-first LSTM followed by a linear layer on the last time step."""

    def __init__(self, input_size=5, hidden_size=64, num_layers=2):
        super().__init__()
        # batch_first=True: inputs are indexed as (batch, step, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Maps the LSTM hidden vector to a single output value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per batch element, computed from the final time step."""
        sequence_output, _state = self.lstm(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

def train(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over every batch in ``train_loader``.

    The model is switched to training mode; each batch is moved to ``device``,
    gradients are cleared, the loss is back-propagated and the optimizer takes
    one step. Nothing is returned.
    """
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

def evaluate(model, test_loader, criterion, device):
    """Return the mean of the per-batch losses of ``model`` over ``test_loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding ``(inputs, targets)`` tensor pairs. It
            does not need to support ``len()``.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar tensor.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of the batch losses divided by the number of batches, as a float.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0  # counted while iterating so un-sized iterables work too
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item()
            num_batches += 1
    if num_batches == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError("test_loader yielded no batches; cannot compute an average loss")
    return total_loss / num_batches

if __name__ == '__main__':
    # Training entry point: parse options, build windowed data, fit the LSTM,
    # and write the weights to <model_dir>/model.pth.
    parser = argparse.ArgumentParser()
    # Directory options default to the SM_* environment variables when they are set,
    # otherwise to the current directory.
    parser.add_argument('--data_dir', type=str, default=os.environ.get('SM_CHANNEL_TRAINING', '.'))
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR', '.'))
    parser.add_argument('--output_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR', '.'))
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    parser.add_argument('--time-steps', type=int, default=10)
    # Seed for torch (weight initialisation and DataLoader shuffling).
    parser.add_argument('--seed', type=int, default=42)
    args = parser.parse_args()

    # Without this, two runs with identical arguments produce different models.
    torch.manual_seed(args.seed)

    logger.info("Loading and preprocessing data")
    X, y = load_data(args.data_dir)

    # Create sequences for LSTM
    X_seq, y_seq = create_sequences(X, y, time_steps=args.time_steps)
    logger.info(f"Sequence data shape: X: {X_seq.shape}, y: {y_seq.shape}")
    if len(X_seq) == 0:
        raise ValueError(
            f"Need more than {args.time_steps} rows to build a sequence, got {len(X)}"
        )

    # Order-preserving split: consecutive windows share all but one row, so a
    # shuffled split would put near-duplicates of the training windows into the
    # test set and make the reported test loss look better than it is.
    # NOTE(review): assumes the CSV rows are in time order - confirm.
    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

    # Targets get a trailing dimension so they match the model's (batch, 1) output.
    X_train = torch.FloatTensor(X_train)
    y_train = torch.FloatTensor(y_train).unsqueeze(1)
    X_test = torch.FloatTensor(X_test)
    y_test = torch.FloatTensor(y_test).unsqueeze(1)

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)

    logger.info("Setting up model, criterion, and optimizer")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Architecture arguments must stay in sync with the ones model_fn uses to rebuild the network.
    model = BatteryModel(input_size=5, hidden_size=64, num_layers=2).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    logger.info("Starting training")
    for epoch in range(args.epochs):
        train(model, train_loader, criterion, optimizer, device)
        test_loss = evaluate(model, test_loader, criterion, device)
        logger.info(f'Epoch {epoch+1}/{args.epochs}, Test Loss: {test_loss:.4f}')

    logger.info("Saving model")
    # Make sure the target directory exists when the script is run with a custom --model_dir.
    os.makedirs(args.model_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(args.model_dir, 'model.pth'))
    logger.info('Training complete. Model saved.')

def model_fn(model_dir):
    """Rebuild the network and restore the weights stored as ``model.pth`` in ``model_dir``.

    The state dict is mapped onto CUDA when available, otherwise CPU, and the
    returned model is moved to that same device.
    """
    logger.info(f"Loading model from {model_dir}")
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    weights_path = os.path.join(model_dir, 'model.pth')
    network = BatteryModel(input_size=5, hidden_size=64, num_layers=2)
    with open(weights_path, 'rb') as weights_file:
        state = torch.load(weights_file, map_location=target_device)
    network.load_state_dict(state)
    return network.to(target_device)

def input_fn(request_body, request_content_type):
    """Decode a request payload into a float32 array shaped ``(batch, 10, 5)``.

    Args:
        request_body: Raw payload. For ``application/json`` a JSON array of
            numbers; for ``application/x-npy`` either a serialized ``.npy``
            file (as written by ``numpy.save``) or raw float32 bytes.
        request_content_type: MIME type of ``request_body``.

    Returns:
        ``numpy.ndarray`` of dtype float32 with shape ``(batch, 10, 5)``; a
        payload of 50 values yields ``(1, 10, 5)``.

    Raises:
        ValueError: If the content type is unsupported or the number of values
            is not a multiple of 50.
    """
    import io  # function-scope import so this block does not depend on the file header

    if request_content_type == 'application/json':
        values = np.array(json.loads(request_body), dtype=np.float32)
    elif request_content_type == 'application/x-npy':
        if bytes(request_body[:6]) == b'\x93NUMPY':
            # A real .npy payload starts with a magic string and a header.
            # Reading it with np.frombuffer would interpret the header bytes as
            # floats, so parse the format properly instead.
            values = np.load(io.BytesIO(request_body), allow_pickle=False).astype(np.float32)
        else:
            # Headerless payload: treat the bytes as raw float32 values.
            # Copy because np.frombuffer returns a read-only view.
            values = np.frombuffer(request_body, dtype=np.float32).copy()
    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")
    # -1 lets several windows be sent in one request; one window still gives (1, 10, 5).
    return values.reshape(-1, 10, 5)

def predict_fn(input_data, model):
    """Run ``model`` on ``input_data`` and return the predictions as a NumPy array.

    Args:
        input_data: ``numpy.ndarray`` produced by ``input_fn``.
        model: ``torch.nn.Module`` to run; it is switched to eval mode.

    Returns:
        ``numpy.ndarray`` holding the model output, always located on the CPU.
    """
    model.eval()
    # Run on whatever device holds the model's weights. model_fn moves the model
    # to CUDA when available, and a CPU input tensor would then fail with a
    # device mismatch. Parameter-less modules fall back to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    with torch.no_grad():
        # Convert numpy array to PyTorch tensor
        input_tensor = torch.from_numpy(input_data).float().to(device)
        output = model(input_tensor)
    # .numpy() only works on CPU tensors, so bring the result back first.
    return output.cpu().numpy()

def output_fn(prediction, accept='application/json'):
    """Serialize ``prediction`` (converted with ``.tolist()``) to a JSON string.

    Only the ``application/json`` accept type is supported; any other value
    raises ``ValueError``.
    """
    logger.info(f"Returning output with accept type: {accept}")
    if accept != 'application/json':
        raise ValueError(f"Unsupported accept type: {accept}")
    return json.dumps(prediction.tolist())

Overwriting sagemaker_ml_script.py


In [149]:
# Stage the entry-point script in its own folder, which the estimator uses as source_dir.
import shutil
os.makedirs(name='sagemaker_code', exist_ok=True)
shutil.copy2(src='sagemaker_ml_script.py', dst='sagemaker_code/')

'sagemaker_code/sagemaker_ml_script.py'

In [151]:
# Hyperparameter keys match the command-line flags declared in sagemaker_ml_script.py.
training_hyperparameters = {
    'epochs': 100,
    'batch-size': 32,
    'learning-rate': 0.001,
}

# Estimator describing the training job: script location, container version and instance.
estimator = PyTorch(
    entry_point='sagemaker_ml_script.py',
    source_dir='sagemaker_code',
    role=role,
    framework_version='1.8.0',
    py_version='py3',
    instance_type='ml.m5.large',
    instance_count=1,
    hyperparameters=training_hyperparameters,
)

In [152]:
# Launch the training job, supplying the CSV at this S3 URI as the 'training' channel.
data_location = 's3://digitaltwinbattery/discharge.csv'
estimator.fit(inputs={'training': data_location})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-09-06-11-39-49-017


2024-09-06 11:39:53 Starting - Starting the training job...
2024-09-06 11:40:08 Starting - Preparing the instances for training...
2024-09-06 11:40:32 Downloading - Downloading input data...
2024-09-06 11:41:18 Downloading - Downloading the training image...
2024-09-06 11:41:53 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-09-06 11:41:57,721 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-09-06 11:41:57,723 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-09-06 11:41:57,734 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-09-06 11:41:57,736 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-09-06 11:41:57,891 sagemaker-training-to

In [153]:
# Host the trained model on a single-instance endpoint and report the endpoint's name.
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m5.large')
endpoint_name = predictor.endpoint_name
print(f"Model deployed. Endpoint name: {endpoint_name}")

INFO:sagemaker:Repacking model artifact (s3://sagemaker-eu-north-1-522814719181/pytorch-training-2024-09-06-11-39-49-017/output/model.tar.gz), script artifact (s3://sagemaker-eu-north-1-522814719181/pytorch-training-2024-09-06-11-39-49-017/source/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-eu-north-1-522814719181/pytorch-training-2024-09-06-11-48-21-242/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-training-2024-09-06-11-48-21-242
INFO:sagemaker:Creating endpoint-config with name pytorch-training-2024-09-06-11-48-21-242
INFO:sagemaker:Creating endpoint with name pytorch-training-2024-09-06-11-48-21-242


------!Model deployed. Endpoint name: pytorch-training-2024-09-06-11-48-21-242
