In [1]:
!pip install sagemaker torch torchvision



In [2]:
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# Resolve the execution role and open the SageMaker session used by later cells
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

In [31]:
# Create the directory that will hold train.py and requirements.txt.
# exist_ok=True keeps the cell idempotent when the notebook is re-run.
os.makedirs('source_dir', exist_ok=True)

In [66]:
%%writefile source_dir/train.py

import argparse
import os
import io
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import mlflow
import mlflow.pytorch

class LegoDataset(Dataset):
    """Dataset pairing image file paths with their labels.

    Images are opened on access and converted to RGB; when a transform is
    supplied it is applied to the image before it is returned.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths, self.labels = image_paths, labels
        self.transform = transform

    def __len__(self):
        # One sample per image path.
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the (image, label) pair stored at position `idx`."""
        rgb_image = Image.open(self.image_paths[idx]).convert('RGB')
        target = self.labels[idx]

        if not self.transform:
            return rgb_image, target
        return self.transform(rgb_image), target

class LegoCNN(nn.Module):
    """Small CNN that emits one logit per image.

    Three conv + ReLU + 2x2 max-pool stages (3 -> 32 -> 64 -> 64 channels) feed
    a 64-unit hidden layer with dropout and a single-output linear layer. The
    flatten size of 64 * 8 * 8 corresponds to 64x64 inputs after the three
    poolings.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=64 * 8 * 8, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Run the network on a batch of images and return the raw logits."""
        # Each stage: convolution -> ReLU -> halve the spatial size.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(torch.relu(conv(x)))
        flat = x.view(-1, 64 * 8 * 8)
        hidden = self.dropout(torch.relu(self.fc1(flat)))
        return self.fc2(hidden)
        
def calculate_accuracy(model, data_loader, device):
    """Return the fraction of samples whose thresholded prediction equals the label.

    The model is switched to eval mode and run without gradients. A sample is
    predicted positive when sigmoid(logit) > 0.5.

    Args:
        model: Module producing one logit per sample.
        data_loader: Iterable of (images, labels) batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            # Flatten both sides to 1-D before comparing. With (batch, 1) logits
            # and (batch,) labels, `==` would broadcast to a (batch, batch)
            # matrix and count far more "matches" than there are samples.
            logits = model(images).reshape(-1)
            predicted = (torch.sigmoid(logits) > 0.5).float()
            targets = labels.reshape(-1).float()
            total += targets.numel()
            correct += (predicted == targets).sum().item()
    # Guard against an empty loader instead of dividing by zero.
    return correct / total if total else 0.0

def _list_image_files(directory):
    """Return full paths of the .png/.jpg/.jpeg files directly inside `directory`."""
    # Match on the lower-cased name so files such as 'x.JPG' are not silently skipped.
    return [os.path.join(directory, name) for name in os.listdir(directory)
            if name.lower().endswith(('.png', '.jpg', '.jpeg'))]


def train(args):
    """Train LegoCNN on the Good/Defective image folders and track the run in MLflow.

    Args:
        args: Parsed command-line namespace. Reads `mlflow_tracking_arn`,
            `epochs`, `batch_size`, `learning_rate`, `train` (directory that
            contains the `Good` and `Defective` sub-folders) and `model_dir`
            (where `model.pth` is written).

    Raises:
        ValueError: If neither sub-folder contains a supported image file.
    """
    if args.mlflow_tracking_arn:
        mlflow.set_tracking_uri(args.mlflow_tracking_arn)

    mlflow.set_experiment("Lego-Quality-Classification-SageMaker")

    with mlflow.start_run():
        # Log parameters
        mlflow.log_params({
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "learning_rate": args.learning_rate,
            "optimizer": "Adam"
        })

        # Set up data transforms
        transform = transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        # SageMaker paths
        train_data_dir = args.train
        good_dir = os.path.join(train_data_dir, 'Good')
        defective_dir = os.path.join(train_data_dir, 'Defective')

        print(f"Looking for data in: {train_data_dir}")
        print(f"Good directory: {good_dir}")
        print(f"Defective directory: {defective_dir}")

        # Load data
        good_images = _list_image_files(good_dir)
        defective_images = _list_image_files(defective_dir)

        print(f"Found {len(good_images)} good images and {len(defective_images)} defective images")

        # Label convention: 0 = Good, 1 = Defective.
        all_images = good_images + defective_images
        all_labels = [0] * len(good_images) + [1] * len(defective_images)

        # Fail with a clear message rather than deep inside the data loader.
        if not all_images:
            raise ValueError(f"No training images found under {train_data_dir}")

        # Create datasets
        dataset = LegoDataset(all_images, all_labels, transform=transform)
        train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

        # Initialize model
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if device.type == 'cpu':
            # Only set thread count for CPU training
            num_threads = 2  # or adjust based on instance type
            torch.set_num_threads(num_threads)
            print(f"Set number of CPU threads to: {num_threads}")
        model = LegoCNN().to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

        # Training loop
        for epoch in range(args.epochs):
            model.train()
            running_loss = 0.0
            for images, labels in train_loader:
                # BCEWithLogitsLoss needs float targets shaped like the (batch, 1) logits.
                images, labels = images.to(device), labels.to(device).float().unsqueeze(1)

                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            # Log the mean batch loss once per epoch, after every batch has been
            # seen; logging inside the batch loop would record partial averages
            # repeatedly under the same step.
            epoch_loss = running_loss / len(train_loader)
            mlflow.log_metric("epoch_loss", epoch_loss, step=epoch)

            # Accuracy is measured on the (augmented) training loader; this
            # script has no held-out split.
            accuracy = calculate_accuracy(model, train_loader, device)
            mlflow.log_metric("accuracy", accuracy, step=epoch)

            print(f'Epoch [{epoch+1}/{args.epochs}], Loss: {epoch_loss:.4f}')

        # Save the model
        model_path = os.path.join(args.model_dir, 'model.pth')
        torch.save(model.state_dict(), model_path)

        # Log the model to MLflow
        mlflow.pytorch.log_model(model, "model")
    
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and start training.
    parser = argparse.ArgumentParser()

    # SageMaker parameters
    parser.add_argument('--mlflow-tracking-arn', type=str)
    # --region is accepted but train() does not read it.
    parser.add_argument('--region', type=str, default='us-east-1')
    # Use .get() with the standard SageMaker container paths as fallbacks so the
    # script does not raise KeyError while building the parser when it is run
    # outside a training container with --model-dir / --train given explicitly.
    parser.add_argument('--model-dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument('--train', type=str,
                        default=os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train'))

    # Training parameters
    parser.add_argument('--epochs', type=int, default=15)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--learning-rate', type=float, default=0.001)

    args = parser.parse_args()
    train(args)

def model_fn(model_dir):
    """Rebuild LegoCNN from `model.pth` in `model_dir` and return it in eval mode.

    The weights are mapped onto CUDA when it is available, otherwise onto CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    weights_file = os.path.join(model_dir, 'model.pth')

    model = LegoCNN()
    with open(weights_file, 'rb') as handle:
        state = torch.load(handle, map_location=device)
        model.load_state_dict(state)

    model.to(device)
    model.eval()
    return model

def input_fn(request_body, request_content_type):
    """Decode an 'application/x-image' request body into a preprocessed tensor.

    Raises:
        ValueError: For any other content type.
    """
    if request_content_type != 'application/x-image':
        raise ValueError(f"Unsupported content type: {request_content_type}")

    # Decode the raw bytes as an image and force three colour channels.
    raw_stream = io.BytesIO(request_body)
    rgb_image = Image.open(raw_stream).convert('RGB')
    return preprocess_image(rgb_image)

def predict_fn(input_data, model):
    """Score one preprocessed image and label it "Good" or "Defective".

    Returns a dict with the label under "prediction" and the sigmoid of the
    model output under "probability"; values below 0.5 are labelled "Good".
    """
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(target_device)
    batch = input_data.to(target_device)

    with torch.no_grad():
        # .item() requires the model to yield exactly one value for the input.
        probability = torch.sigmoid(model(batch)).item()

    if probability < 0.5:
        prediction = "Good"
    else:
        prediction = "Defective"

    return {"prediction": prediction, "probability": probability}

def output_fn(prediction, accept):
    """Serialize `prediction` as JSON and return it together with `accept`.

    Raises:
        ValueError: When `accept` is anything other than 'application/json'.
    """
    if accept != 'application/json':
        raise ValueError(f"Unsupported accept type: {accept}")
    body = json.dumps(prediction)
    return body, accept

def preprocess_image(image):
    """Resize to 64x64, convert to a normalized tensor, and add a leading batch axis.

    Applies the same resize and normalization as the training transform, but
    without the random flip and rotation.
    """
    steps = [
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
    tensor = transforms.Compose(steps)(image)
    return tensor.unsqueeze(0)

Overwriting source_dir/train.py


In [57]:
%%writefile source_dir/requirements.txt
mlflow==2.13.2
sagemaker-mlflow==0.1.0

Overwriting source_dir/requirements.txt


In [5]:
# Upload training data to S3
bucket = sagemaker_session.default_bucket()
prefix = 'lego-classification'

# Upload your local images to S3
train_data_path = 'Images'
s3_train_data = sagemaker_session.upload_data(
    path=train_data_path,
    bucket=bucket,
    # S3 keys always use '/', so build the prefix explicitly instead of with
    # os.path.join, which would insert a backslash on Windows.
    key_prefix=f'{prefix}/train'
)

In [55]:
# MLflow tracking server that the training script reports to
mlflow_tracking_arn = 'arn:aws:sagemaker:us-east-1:316413003582:mlflow-tracking-server/sample-mlflow-tracking'

# Keys match the argparse flags defined in train.py
training_hyperparameters = {
    'epochs': 15,
    'batch-size': 16,
    'learning-rate': 0.001,
    'mlflow-tracking-arn': mlflow_tracking_arn,
    'region': 'us-east-1',
}

# Create PyTorch estimator
pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir='source_dir',
    role=role,
    framework_version='1.12.0',
    py_version='py38',
    instance_count=1,
    instance_type='ml.m5.xlarge',
    hyperparameters=training_hyperparameters,
)

In [48]:
# Display the S3 URI returned by upload_data; it is passed to fit() as the 'train' channel.
s3_train_data

's3://sagemaker-us-east-1-316413003582/lego-classification/train'

In [67]:
# Launch the training job, feeding the uploaded images in as the 'train' channel
pytorch_estimator.fit(inputs={'train': s3_train_data})

2024-12-19 07:12:24 Starting - Starting the training job...
2024-12-19 07:12:39 Starting - Preparing the instances for training...
2024-12-19 07:13:15 Downloading - Downloading the training image......
2024-12-19 07:14:11 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-12-19 07:14:16,731 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-12-19 07:14:16,733 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-12-19 07:14:16,741 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-12-19 07:14:16,743 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-12-19 07:14:16,925 sagemaker-training-toolkit INFO     Installing dependencies from requirements.tx

In [68]:
# Deploy the fitted estimator to an endpoint backed by a single ml.m5.xlarge
# instance. The returned predictor's endpoint_name is what the evaluation cell
# passes to invoke_endpoint.
predictor = pytorch_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

-------!

In [73]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.2


In [74]:
import json

import boto3
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate the deployed endpoint on a handful of labelled images and record the
# per-image predictions, summary metrics, confusion matrix and results table in
# a single MLflow run.

# Set up MLflow tracking
arn = 'arn:aws:sagemaker:us-east-1:316413003582:mlflow-tracking-server/sample-mlflow-tracking'

mlflow.set_tracking_uri(arn)
mlflow.set_experiment("Lego-Quality-Endpoint-Evaluation")

runtime = boto3.client('runtime.sagemaker')

# (local image path, expected label) pairs sent to the endpoint
test_images = [
    ('Images/Good/image_0d06e4a3-c6a6-4130-9e6f-363e44831dcc.jpg', 'Good'),
    ('Images/Good/image_23120a2b-0ead-408a-824a-bc909e9de2b9.jpg', 'Good'),
    ('Images/Defective/0-change lego block to purple.png', 'Defective'),
    ('Images/Defective/1-change lego block to light green.png', 'Defective')
]

with mlflow.start_run(run_name="endpoint-evaluation"):
    # Lists to store results
    true_labels = []
    predicted_labels = []
    probabilities = []
    results = []  # raw endpoint responses, one per test image, in order

    # Log endpoint details
    mlflow.log_param("endpoint_name", predictor.endpoint_name)
    mlflow.log_param("num_test_images", len(test_images))

    # Process each image
    for img_path, true_label in test_images:
        with open(img_path, 'rb') as f:
            payload = f.read()

        response = runtime.invoke_endpoint(
            EndpointName=predictor.endpoint_name,
            ContentType='application/x-image',
            Body=payload
        )

        result = json.loads(response['Body'].read().decode())
        print(f"Image: {img_path}")
        print(f"True label: {true_label}")
        print(f"Prediction: {result}")
        print("-------------------")

        # Store results (binary encoding: 1 = Defective, 0 = Good)
        results.append(result)
        true_labels.append(1 if true_label == 'Defective' else 0)
        predicted_labels.append(1 if result['prediction'] == 'Defective' else 0)
        probabilities.append(result['probability'])

        # Log individual prediction
        mlflow.log_metrics({
            f"prediction_{len(predicted_labels)}_probability": result['probability'],
            f"prediction_{len(predicted_labels)}_correct":
                1 if (result['prediction'] == true_label) else 0
        })

    # Calculate metrics. zero_division=0 makes the "no positive predictions"
    # case an explicit 0 instead of a warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=0)
    recall = recall_score(true_labels, predicted_labels, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, zero_division=0)
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # Good/Defective tick labels, even if one class is absent from the results.
    conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

    # Log overall metrics
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "avg_probability": float(np.mean(probabilities))
    })

    # Log confusion matrix as a figure
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d',
                xticklabels=['Good', 'Defective'],
                yticklabels=['Good', 'Defective'],
                ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

    # Save and log the confusion matrix plot
    fig.savefig('confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')
    plt.close(fig)

    # Detailed per-image results table, built from the responses collected above
    results_df = pd.DataFrame({
        'Image': [path for path, _ in test_images],
        'True_Label': [label for _, label in test_images],
        'Predicted_Label': [res['prediction'] for res in results],
        'Probability': [res['probability'] for res in results],
        'Correct': [label == res['prediction']
                    for (_, label), res in zip(test_images, results)]
    })

    # Save and log the results table while the run is still active, so the
    # artifact is attached to this run rather than to an implicitly created one.
    results_df.to_csv('prediction_results.csv', index=False)
    mlflow.log_artifact('prediction_results.csv')

    # Print summary
    print("\nEvaluation Summary:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)

print("\nDetailed Results:")
print(results_df)


Image: Images/Good/image_0d06e4a3-c6a6-4130-9e6f-363e44831dcc.jpg
True label: Good
Prediction: {'prediction': 'Good', 'probability': 0.0001365438220091164}
-------------------
Image: Images/Good/image_23120a2b-0ead-408a-824a-bc909e9de2b9.jpg
True label: Good
Prediction: {'prediction': 'Good', 'probability': 0.0006760513060726225}
-------------------
Image: Images/Defective/0-change lego block to purple.png
True label: Defective
Prediction: {'prediction': 'Defective', 'probability': 0.9958510398864746}
-------------------
Image: Images/Defective/1-change lego block to light green.png
True label: Defective
Prediction: {'prediction': 'Defective', 'probability': 0.9440397620201111}
-------------------

Evaluation Summary:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Confusion Matrix:
[[2 0]
 [0 2]]
