In [1]:
import argparse
import os
import copy

import torch
from torch import nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm

from models import SRCNN
from datasets import TrainDataset, EvalDataset
from utils import AverageMeter, calc_psnr

import mlflow
import numpy as np
from azureml.core import Workspace
from mlflow import MlflowClient

In [2]:
ws = Workspace.from_config()
experiment_name = "isr_cs5412"

# set up MLflow to track the metrics
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)
mlflow.autolog()


2022/11/30 05:33:11 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2022/11/30 05:33:11 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.
2022/11/30 05:33:11 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [3]:
torch.cuda.is_available()

True

In [4]:
train_file  = "data/91-image_x4.h5"
eval_file   = "data/Set5_x4.h5"
outputs_dir = "outputs"
scale       = 4
lr          = 1e-4
B           = 16
E           = 50
n_workers   = 3
seed        = 3


In [5]:
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--train-file',     default="data/91-image_x4.h5",  type=str)
#     parser.add_argument('--eval-file',      default="data/Set5_x4.h5",      type=str)
#     parser.add_argument('--outputs-dir',    default="outputs",              type=str)
#     parser.add_argument('--scale',          default=3,                      type=int)
#     parser.add_argument('--lr',             default=1e-4,                   type=float)
#     parser.add_argument('--batch-size',     default=16,                     type=int)
#     parser.add_argument('--num-epochs',     default=400,                    type=int)
#     parser.add_argument('--num-workers',    default=8,                      type=int)
#     parser.add_argument('--seed',           default=114,                    type=int)
#     args = parser.parse_args()

outputs_dir = os.path.join(outputs_dir, 'x{}'.format(scale))

if not os.path.exists(outputs_dir):
    os.makedirs(outputs_dir)

cudnn.benchmark = True
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(seed)

model = SRCNN().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam([
    {'params': model.conv1.parameters()},
    {'params': model.conv2.parameters()},
    {'params': model.conv3.parameters(), 'lr': lr * 0.1}
], lr=lr)

train_dataset = TrainDataset(train_file)
train_dataloader = DataLoader(dataset=train_dataset,
                                batch_size=B,
                                shuffle=True,
                                num_workers=n_workers,
                                pin_memory=True,
                                drop_last=True)
eval_dataset = EvalDataset(eval_file)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=1)


best_weights = copy.deepcopy(model.state_dict())
best_epoch = 0
best_psnr = 0.0

for epoch in range(E):
    model.train()
    epoch_losses = AverageMeter()

    with tqdm(total=(len(train_dataset) - len(train_dataset) % B)) as t:
        t.set_description('epoch: {}/{}'.format(epoch, E - 1))

        for data in train_dataloader:
            inputs, labels = data

            inputs = inputs.to(device)
            labels = labels.to(device)

            preds = model(inputs)

            loss = criterion(preds, labels)

            epoch_losses.update(loss.item(), len(inputs))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            t.set_postfix(loss='{:.6f}'.format(epoch_losses.avg))
            t.update(len(inputs))

    torch.save(model.state_dict(), os.path.join(outputs_dir, 'epoch_{}.pth'.format(epoch)))

    model.eval()
    epoch_psnr = AverageMeter()

    for data in eval_dataloader:
        inputs, labels = data

        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            preds = model(inputs).clamp(0.0, 1.0)

        epoch_psnr.update(calc_psnr(preds, labels), len(inputs))

    print('eval psnr: {:.2f}'.format(epoch_psnr.avg))

    if epoch_psnr.avg > best_psnr:
        best_epoch = epoch
        best_psnr = epoch_psnr.avg
        best_weights = copy.deepcopy(model.state_dict())

print('best epoch: {}, psnr: {:.2f}'.format(best_epoch, best_psnr))
torch.save(best_weights, os.path.join(outputs_dir, 'best.pth'))


with mlflow.start_run() as run:
    mlflow.pytorch.log_model(model, "model")

    # convert to scripted model and log the model
    scripted_pytorch_model = torch.jit.script(model)
    mlflow.pytorch.log_model(scripted_pytorch_model, "scripted_model")

# Fetch the logged model artifacts
print("run_id: {}".format(run.info.run_id))
for artifact_path in ["model/data", "scripted_model/data"]:
    artifacts = [f.path for f in MlflowClient().list_artifacts(run.info.run_id,
                artifact_path)]
    print("artifacts: {}".format(artifacts))

epoch: 0/49: 100%|██████████| 21760/21760 [00:28<00:00, 759.52it/s, loss=0.003777] 
epoch: 1/49: 100%|██████████| 21760/21760 [00:24<00:00, 880.38it/s, loss=0.002125] 
epoch: 2/49: 100%|██████████| 21760/21760 [00:14<00:00, 1469.08it/s, loss=0.002073]
epoch: 3/49: 100%|██████████| 21760/21760 [00:15<00:00, 1445.08it/s, loss=0.002045]
epoch: 4/49: 100%|██████████| 21760/21760 [00:14<00:00, 1482.87it/s, loss=0.002028]
epoch: 5/49: 100%|██████████| 21760/21760 [00:15<00:00, 1448.32it/s, loss=0.002012]
epoch: 6/49: 100%|██████████| 21760/21760 [00:14<00:00, 1462.98it/s, loss=0.002001]
epoch: 7/49: 100%|██████████| 21760/21760 [00:14<00:00, 1489.53it/s, loss=0.001991]
epoch: 8/49: 100%|██████████| 21760/21760 [00:14<00:00, 1454.03it/s, loss=0.001983]
epoch: 9/49: 100%|██████████| 21760/21760 [00:14<00:00, 1464.28it/s, loss=0.001974]
epoch: 10/49: 100%|██████████| 21760/21760 [00:14<00:00, 1479.91it/s, loss=0.001967]
epoch: 11/49: 100%|██████████| 21760/21760 [00:14<00:00, 1460.51it/s, loss=

eval psnr: 29.25
eval psnr: 29.48
eval psnr: 29.58
eval psnr: 29.64
eval psnr: 29.68
eval psnr: 29.75
eval psnr: 29.75
eval psnr: 29.83
eval psnr: 29.81
eval psnr: 29.83
eval psnr: 29.88
eval psnr: 29.90
eval psnr: 29.79
eval psnr: 29.91
eval psnr: 29.87
eval psnr: 29.91
eval psnr: 29.91
eval psnr: 29.91
eval psnr: 29.95
eval psnr: 29.69
eval psnr: 29.99
eval psnr: 29.99
eval psnr: 30.02
eval psnr: 30.02
eval psnr: 30.02
eval psnr: 30.03
eval psnr: 30.03
eval psnr: 30.02
eval psnr: 30.01
eval psnr: 29.95
eval psnr: 30.07
eval psnr: 30.06
eval psnr: 30.01
eval psnr: 30.07
eval psnr: 30.07
eval psnr: 30.08
eval psnr: 30.05
eval psnr: 30.08
eval psnr: 30.09
eval psnr: 30.08
eval psnr: 30.11
eval psnr: 30.06
eval psnr: 30.10
eval psnr: 30.11
eval psnr: 30.12
eval psnr: 30.13
eval psnr: 30.11
eval psnr: 30.10
eval psnr: 30.14
eval psnr: 30.10
best epoch: 48, psnr: 30.14
run_id: 579a43be-2ec1-457e-b4ed-b6490fd14c24
artifacts: ['model/data/model.pth', 'model/data/pickle_module_info.txt']
arti

In [6]:
run.info.run_id

'579a43be-2ec1-457e-b4ed-b6490fd14c24'

In [7]:
# register the model
model_uri = "runs:/{}/model".format(run.info.run_id)
model = mlflow.register_model(model_uri, "isr_srcnn_x4")

Successfully registered model 'isr_srcnn_x4'.
2022/11/30 05:50:55 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: isr_srcnn_x4, version 1
Created version '1' of model 'isr_srcnn_x4'.


In [9]:
model = mlflow.pytorch.load_model(model_uri=model_uri)
model

SRCNN(
  (conv1): Conv2d(1, 64, kernel_size=(9, 9), stride=(1, 1), padding=(4, 4))
  (conv2): Conv2d(64, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (conv3): Conv2d(32, 1, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (relu): ReLU(inplace=True)
)

In [18]:
import requests

import torch
import torch.backends.cudnn as cudnn
import numpy as np
import PIL.Image as pil_image

from models import SRCNN
from utils import convert_rgb_to_ycbcr, convert_ycbcr_to_rgb, calc_psnr

image_url = "https://raw.githubusercontent.com/Coloquinte/torchSR/v1.0.2/doc/example_small.png"
r = requests.get(image_url, stream=True)
r.raw

<urllib3.response.HTTPResponse at 0x7f2bbf98f670>

In [19]:
r.status_code

200

In [28]:
from PIL import Image
from io import BytesIO

image_raw = Image.open(BytesIO(r.content)).convert('RGB')
image_raw.save(f'thumbnails{os.sep}butterfly.jpg')
image_file = f'thumbnails{os.sep}butterfly.jpg'

In [29]:
image = pil_image.open(image_file).convert('RGB')
# image = Image.open(BytesIO(r.content)).convert('RGB')

image_width = (image.width // scale) * scale
image_height = (image.height // scale) * scale
image = image.resize((image_width, image_height), resample=pil_image.BICUBIC)
image = image.resize((image.width // scale, image.height // scale), resample=pil_image.BICUBIC)
image = image.resize((image.width * scale, image.height * scale), resample=pil_image.BICUBIC)
image.save(image_file.replace('.', '_bicubic_x{}.'.format(scale)))

image = np.array(image).astype(np.float32)
ycbcr = convert_rgb_to_ycbcr(image)

y = ycbcr[..., 0]
y /= 255.
y = torch.from_numpy(y).to(device)
y = y.unsqueeze(0).unsqueeze(0)

with torch.no_grad():
    preds = model(y).clamp(0.0, 1.0)

psnr = calc_psnr(y, preds)
print('PSNR: {:.2f}'.format(psnr))

preds = preds.mul(255.0).cpu().numpy().squeeze(0).squeeze(0)

output = np.array([preds, ycbcr[..., 1], ycbcr[..., 2]]).transpose([1, 2, 0])
output = np.clip(convert_ycbcr_to_rgb(output), 0.0, 255.0).astype(np.uint8)
output = pil_image.fromarray(output)
output.save(image_file.replace('.', '_srcnn_x{}.'.format(scale)))


PSNR: 28.88


## deployment configs

In [8]:
# create environment for the deploy
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.webservice import AciWebservice

# get a curated environment
env = Environment.get(
    workspace=ws, 
    # name="AzureML-pytorch-1.12.0-ubuntu18.04-py37-cpu-inference",
    name="AzureML-ACPT-pytorch-1.12-py38-cuda11.6-gpu",
    version=1
)
env.inferencing_stack_version='latest'

# create deployment config i.e. compute resources
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=8,
    memory_gb=32,
    tags={"data": "91-images_x4", "method": "srcnn_isr_x4"},
    description="Image Super Resolution with SRCNN",
)

## deploy model 

In [None]:
%%time
import uuid
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.model import Model

# get the registered model
model = Model(ws, "isr_srcnn_x4")

# create an inference config i.e. the scoring script and environment
inference_config = InferenceConfig(entry_script="score.py", environment=env)

# deploy the service
service_name = "isr-srcnn-x4-" + str(uuid.uuid4())[:4]
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)

service.wait_for_deployment(show_output=True)

In [None]:
# send raw HTTP request to test the web service.
import requests

# send a random row from the test set to score
random_index = np.random.randint(0, len(X_test) - 1)
input_data = '{"data": [' + str(list(X_test[random_index])) + "]}"

headers = {"Content-Type": "application/json"}

resp = requests.post(service.scoring_uri, input_data, headers=headers)

print("POST to url", service.scoring_uri)
print("label:", y_test[random_index])
print("prediction:", resp.text)