In [1]:
import argparse
import os
import copy

import torch
from torch import nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm

from models import SRCNN
from datasets import TrainDataset, EvalDataset
from utils import AverageMeter, calc_psnr

import mlflow
import numpy as np
from azureml.core import Workspace
from mlflow import MlflowClient

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the MLflow experiment that runs from this notebook are grouped under.
experiment_name = "isr_cs5412"

# Load the Azure ML workspace from the local config file.
ws = Workspace.from_config()

# Point MLflow at the workspace's tracking server, select the experiment and
# switch on automatic logging.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.autolog()


In [3]:
# ---- Run configuration -------------------------------------------------------
# Training / evaluation data (HDF5 files prepared for x4 super-resolution).
train_file = "data/91-image_x4.h5"
eval_file = "data/Set5_x4.h5"

# Root directory for checkpoints; a per-scale sub-directory is derived from it.
outputs_dir = "outputs"

# Upscaling factor. It is only used to name the checkpoint sub-directory, so it
# must agree with the "_x4" data files above and the "isr_srcnn_x4" model name
# used at registration/deployment time.
scale = 4

lr = 1e-4        # base learning rate for Adam
B = 16           # training batch size
E = 150          # number of training epochs
n_workers = 8    # DataLoader worker processes for the training set
seed = 114       # seed passed to torch.manual_seed


In [4]:
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--train-file',     default="data/91-image_x4.h5",  type=str)
#     parser.add_argument('--eval-file',      default="data/Set5_x4.h5",      type=str)
#     parser.add_argument('--outputs-dir',    default="outputs",              type=str)
#     parser.add_argument('--scale',          default=3,                      type=int)
#     parser.add_argument('--lr',             default=1e-4,                   type=float)
#     parser.add_argument('--batch-size',     default=16,                     type=int)
#     parser.add_argument('--num-epochs',     default=400,                    type=int)
#     parser.add_argument('--num-workers',    default=8,                      type=int)
#     parser.add_argument('--seed',           default=114,                    type=int)
#     args = parser.parse_args()

# Checkpoints live in a per-scale sub-directory ("<outputs_dir>/x<scale>").
# Only append the scale tag when it is not already the last path component, so
# re-running this cell does not keep nesting directories (".../x3/x3").
scale_tag = 'x{}'.format(scale)
if os.path.basename(os.path.normpath(outputs_dir)) != scale_tag:
    outputs_dir = os.path.join(outputs_dir, scale_tag)

# exist_ok avoids the check-then-create race and makes re-runs a no-op.
os.makedirs(outputs_dir, exist_ok=True)

# Let cuDNN auto-tune convolution algorithms.
cudnn.benchmark = True
# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(seed)

model = SRCNN().to(device)
criterion = nn.MSELoss()
# conv1/conv2 train at the base learning rate; conv3 uses a 10x smaller one.
optimizer = optim.Adam([
    {'params': model.conv1.parameters()},
    {'params': model.conv2.parameters()},
    {'params': model.conv3.parameters(), 'lr': lr * 0.1}
], lr=lr)

# Training batches are shuffled; drop_last=True discards the final partial batch.
train_dataset = TrainDataset(train_file)
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=B,
                              shuffle=True,
                              num_workers=n_workers,
                              pin_memory=True,
                              drop_last=True)
# Evaluation samples are fed one at a time.
eval_dataset = EvalDataset(eval_file)
eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=1)


# Running record of the best evaluation result; starts from the untrained model.
best_psnr = 0.0
best_epoch = 0
best_weights = copy.deepcopy(model.state_dict())

for epoch in range(E):
    # ---- training pass ----
    model.train()
    epoch_losses = AverageMeter()

    # The progress bar counts samples; the final partial batch is excluded
    # because the training loader is built with drop_last=True.
    samples_per_epoch = len(train_dataset) - len(train_dataset) % B
    with tqdm(total=samples_per_epoch) as t:
        t.set_description(f'epoch: {epoch}/{E - 1}')

        for inputs, labels in train_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            preds = model(inputs)
            loss = criterion(preds, labels)
            epoch_losses.update(loss.item(), len(inputs))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            t.set_postfix(loss=f'{epoch_losses.avg:.6f}')
            t.update(len(inputs))

    # Keep a checkpoint for every epoch.
    checkpoint_path = os.path.join(outputs_dir, f'epoch_{epoch}.pth')
    torch.save(model.state_dict(), checkpoint_path)

    # ---- evaluation pass ----
    model.eval()
    epoch_psnr = AverageMeter()

    with torch.no_grad():
        for inputs, labels in eval_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Clamp predictions to [0, 1] before scoring.
            preds = model(inputs).clamp(0.0, 1.0)
            epoch_psnr.update(calc_psnr(preds, labels), len(inputs))

    print(f'eval psnr: {epoch_psnr.avg:.2f}')

    # Snapshot the weights whenever the average PSNR improves.
    if epoch_psnr.avg > best_psnr:
        best_psnr = epoch_psnr.avg
        best_epoch = epoch
        best_weights = copy.deepcopy(model.state_dict())

print(f'best epoch: {best_epoch}, psnr: {best_psnr:.2f}')
torch.save(best_weights, os.path.join(outputs_dir, 'best.pth'))


# Export the best checkpoint rather than whatever the last epoch produced:
# at this point `model` holds the final-epoch weights, while `best_weights` is
# the snapshot with the highest eval PSNR. A separate CPU copy in eval mode is
# logged so the training model is left untouched and the artifacts do not
# require a GPU to load (the deployment environment is a CPU inference image).
export_model = copy.deepcopy(model).cpu().eval()
export_model.load_state_dict(best_weights)

with mlflow.start_run() as run:
    mlflow.pytorch.log_model(export_model, "model")

    # convert to scripted model and log the model
    scripted_pytorch_model = torch.jit.script(export_model)
    mlflow.pytorch.log_model(scripted_pytorch_model, "scripted_model")

# Fetch the logged model artifacts
client = MlflowClient()
print("run_id: {}".format(run.info.run_id))
for artifact_path in ["model/data", "scripted_model/data"]:
    artifacts = [f.path for f in client.list_artifacts(run.info.run_id,
                                                       artifact_path)]
    print("artifacts: {}".format(artifacts))

epoch: 0/149:   0%|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          | 0/21760 [00:04<?, ?it/s]


RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.

import torch
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([16, 1, 33, 33], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(1, 64, kernel_size=[9, 9], padding=[4, 4], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()

ConvolutionParams 
    memory_format = Contiguous
    data_type = CUDNN_DATA_FLOAT
    padding = [4, 4, 0]
    stride = [1, 1, 0]
    dilation = [1, 1, 0]
    groups = 1
    deterministic = false
    allow_tf32 = true
input: TensorDescriptor 0x559502cd91f0
    type = CUDNN_DATA_FLOAT
    nbDims = 4
    dimA = 16, 1, 33, 33, 
    strideA = 1089, 1089, 33, 1, 
output: TensorDescriptor 0x5595025fbdd0
    type = CUDNN_DATA_FLOAT
    nbDims = 4
    dimA = 16, 64, 33, 33, 
    strideA = 69696, 1089, 33, 1, 
weight: FilterDescriptor 0x55950316b6b0
    type = CUDNN_DATA_FLOAT
    tensor_format = CUDNN_TENSOR_NCHW
    nbDims = 4
    dimA = 64, 1, 9, 9, 
Pointer addresses: 
    input: 0x1101050c00
    output: 0x11011e0000
    weight: 0x1100fe0000


In [5]:
# Show the id of the MLflow run opened by the model-logging cell above.
# `run` is only bound once that cell has executed its `with mlflow.start_run()`
# block, so this cell raises NameError if training failed before reaching it.
run.info.run_id

NameError: name 'run' is not defined

In [None]:
# register the model
# The URI points at the "model" artifact logged under the MLflow run above.
# NOTE: this rebinds `model` from the torch module to the registration result.
model_uri = f"runs:/{run.info.run_id}/model"
model = mlflow.register_model(model_uri=model_uri, name="isr_srcnn_x4")

## deployment configs

In [None]:
# create environment for the deploy
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment
from azureml.core.webservice import AciWebservice

# Fetch version 1 of the curated PyTorch CPU inference environment from the
# workspace and ask for the latest inferencing stack.
env = Environment.get(
    workspace=ws,
    name="AzureML-pytorch-1.12.0-ubuntu18.04-py37-cpu-inference",
    version=1,
)
env.inferencing_stack_version = "latest"

# Compute resources and metadata for the ACI web service.
# NOTE(review): 8 cores / 32 GB may exceed the ACI quota in some regions; confirm.
service_tags = {"data": "91-images_x4", "method": "srcnn_isr_x4"}
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=8,
    memory_gb=32,
    tags=service_tags,
    description="Image Super Resolution with SRCNN",
)

## deploy model 

In [None]:
%%time
import uuid
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig, Model

# Look up the model registered in the workspace under "isr_srcnn_x4".
model = Model(ws, "isr_srcnn_x4")

# Scoring entry point plus the environment it runs in.
inference_config = InferenceConfig(
    environment=env,
    entry_script="model_inference.py",
)

# A short random hex suffix reduces the chance of clashing with an existing
# service name on repeated deployments.
service_suffix = uuid.uuid4().hex[:4]
service_name = "isr-srcnn-x4-{}".format(service_suffix)

# Deploy to ACI using the compute configuration built earlier, then block
# until the deployment finishes, streaming its progress.
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
service.wait_for_deployment(show_output=True)

In [None]:
# send raw HTTP request to test the web service.
import json

import requests

# Send a random sample from the evaluation set to score. The notebook never
# defines X_test / y_test, so the sample is drawn from `eval_dataset`, whose
# items unpack as (input, label) exactly as in the evaluation loop.
# np.random.randint's upper bound is exclusive, so len(...) is passed as-is to
# keep the last sample reachable.
random_index = np.random.randint(0, len(eval_dataset))
lr_image, hr_image = eval_dataset[random_index]

# json.dumps on a nested Python list always yields valid JSON, unlike str() of
# a list of numpy scalars.
# NOTE(review): the {"data": [...]} envelope is carried over from the original
# cell; confirm it matches what model_inference.py expects.
input_data = json.dumps({"data": [np.asarray(lr_image).tolist()]})

headers = {"Content-Type": "application/json"}

resp = requests.post(service.scoring_uri, input_data, headers=headers)

print("POST to url", service.scoring_uri)
print("status:", resp.status_code)
# Print only the label's shape; dumping a whole image would flood the output.
print("label shape:", np.asarray(hr_image).shape)
print("prediction:", resp.text)