# このipynbファイルの用途

## 対象者
　sagemakerとpytorchを組み合わせて使ってみたいユーザー

##  できること
    - sagemakerAPIとpytorchの組み合わせによる簡易訓練、デプロイの流れを理解できる
    - カスタマイズする箇所がわかりやすいのでカスタムモデルの構築にスムーズに移行できる

## 関連ファイル
    - feature_extract_cifar10.py

## 検証日
    - 2023-12-13

## ディレクトリ構成
    - [初心者向け]Amazon SageMakerでPyTorch.ipynb
    - feature_extract_cifar10.py


In [1]:
## 必要モデルのinstall
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

### 補足: sagemaker_trainingライブラリは今回は使用しない

In [2]:
%%time

## 環境変数の設定 

import sagemaker
import os
import boto3
import re
import numpy as np

sagemaker_session = sagemaker.Session()

role = sagemaker.get_execution_role()
region = boto3.Session().region_name

bucket='mlearning-bucket'
prefix = 'sagemaker/cnn-cifar10'
# customize to your bucket where you have stored the data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)

### 補足: mlearning-bucketの箇所は自分の使用するsagemaker用のバケット名を指定する

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
CPU times: user 1.18 s, sys: 105 ms, total: 1.29 s
Wall time: 1.4 s


In [3]:
%%time

## データセットのtransform設定
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
])

train_data = datasets.CIFAR10(root='../data', train=True, download=True, transform=transform)
test_data = datasets.CIFAR10(root='../data', train=False, download=True, transform=transform)

### 補足: torchvisionのdatasetsからデータのダウンロード

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:12<00:00, 13549803.20it/s]


Extracting ../data/cifar-10-python.tar.gz to ../data
Files already downloaded and verified
CPU times: user 2.8 s, sys: 602 ms, total: 3.4 s
Wall time: 17.1 s


In [4]:
%%time

## s3へのデータのアップロード
inputs = sagemaker_session.upload_data(path='../data', bucket=bucket, key_prefix=prefix)
print('input spec (in this case, just an S3 path): {}'.format(inputs))

### 補足: s3のsagemaker/cnn-cifar10ディレクトリにアップロード

input spec (in this case, just an S3 path): s3://mlearning-bucket/sagemaker/cnn-cifar10
CPU times: user 5.43 s, sys: 3.7 s, total: 9.13 s
Wall time: 7.32 s


In [6]:
## sagemakerのpytorchモデルを用いた予測クラスの生成
from sagemaker.pytorch import PyTorch

hyper_param = {
    'epochs':100,
    'batch-size': 100,
    'lr': 0.01,
    'momentum': 0.9,
}

estimator = PyTorch(entry_point='feature_extract_cifar10.py',
                            hyperparameters=hyper_param,
                            role=role,
                            framework_version='1.2.0',
                            py_version='py3',
                            train_instance_count=2,
                            train_instance_type='ml.c5.xlarge')

print(f'トレーニングに使用するコンテナイメージは {estimator.training_image_uri()} です')
### 補足: sagemakerのライブラリのアップデートに伴い、以下二つのオプションの設定が必要となったので、設定。;framework_version='1.2.0', py_version='py3'

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [None]:
## 訓練の実行
estimator.fit({'training': inputs}, logs=True)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-12-13-09-28-13-112


2023-12-13 09:28:14 Starting - Starting the training job......
2023-12-13 09:29:14 Starting - Preparing the instances for training.........
2023-12-13 09:30:42 Downloading - Downloading input data.....[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2023-12-13 09:31:24,485 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[35m2023-12-13 09:31:24,488 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2023-12-13 09:31:24,497 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[35m2023-12-13 09:31:24,498 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[35m2023-12-13 09:31:24,668 sagemaker-containers INFO     Module feature_extract_cifar10 does not provide a setup.py. [0m
[35mGenerating setup.py[0m
[35m2023-12-13 09:31:24,669 sagemaker-containers INFO  

In [None]:
## モデルのデプロイ
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
## モデルの検証
import numpy as np
correct = 0
total = 0


## テストのための設定
test_loader = DataLoader(test_data, batch_size=100, shuffle=False)

with torch.no_grad():
    for data in test_loader:
        images, labels = data
        outputs = predictor.predict(images.numpy())
        _, predicted = torch.max(torch.from_numpy(np.array(outputs)), 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))
### 補足2：feature_extract_cifarの中身がs3のデータを使用する形になっていないので、s3の該当のデータをとってくるよう修正が必要


In [None]:
## モデルの後片付け
endpointName = "<<ここに上記で作り、返却されているsagemakerのendpoint名を記載>>"

sagemaker_client = boto3.client('sagemaker')
response = sagemaker_client.delete_endpoint(
    EndpointName=endpointName
)

## 補足：boto3はawsのコンテナであればデフォルトでinstallされている可能性が高い