In [1]:
!nvidia-smi

Thu Jun 15 02:31:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import numpy as np
from utils import download_list, drive_download
%matplotlib inline

In [3]:
download_list(49, "npz")

Downloading k49-train-imgs.npz - 64.6 MB


100%|██████████| 64569/64569 [00:46<00:00, 1389.75KB/s]


Downloading k49-train-labels.npz - 0.2 MB


100%|██████████| 161/161 [00:00<00:00, 242.09KB/s]


Downloading k49-test-imgs.npz - 10.7 MB


100%|██████████| 10715/10715 [00:14<00:00, 740.64KB/s]


Downloading k49-test-labels.npz - 0.0 MB


100%|██████████| 27/27 [00:00<00:00, 122.23KB/s]

All dataset files downloaded!





In [2]:
train_images = np.load('./k49-train-imgs.npz')['arr_0']
test_images = np.load('./k49-test-imgs.npz')['arr_0']
train_labels = np.load('./k49-train-labels.npz')['arr_0']
test_labels = np.load('./k49-test-labels.npz')['arr_0']

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class JPDDataset(Dataset):

    def __init__(self, images, labels):
        # self._process_images(images)
        self.images = torch.from_numpy(images / 255.0)
        self.labels = torch.from_numpy(labels)
        # self.transform = transforms.Compose([
        #     transforms.Normalize((0.5), (0.5))
        # ])

    def __len__(self):
        return self.images.size(0)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = torch.zeros((49))
        label[int(self.labels[idx])] = 1.0
        # image = self.transform(image)
        return image.float().unsqueeze(0), label

In [4]:
def get_loader(train_images, train_label, test_images, test_labels, batch_size):
    train_data = JPDDataset(train_images, train_labels)
    valid_data = JPDDataset(test_images, test_labels)
    print(f"Train: {len(train_data)} samples, Valid: {len(valid_data)} samples")
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
    print(f"Train: {len(train_loader)} batches, Valid: {len(valid_loader)} batches")
    return train_loader, valid_loader

In [5]:
train_loader, valid_loader = get_loader(train_images, train_labels, test_images, test_labels, 2048)

Train: 232365 samples, Valid: 38547 samples
Train: 114 batches, Valid: 19 batches


In [6]:
for idx, (X, y) in enumerate(train_loader):
    print(X.size(), y.size())
    break

torch.Size([2048, 1, 28, 28]) torch.Size([2048, 49])


In [7]:
#@title
import os
import sys
import time
import json
import random
import warnings
from datetime import timedelta
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


class Trainer:

    def __init__(
        self,
        model,
        optimizer,
        criterion,
        scheduler=None
    ):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.model = self.model.to(self.device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.scheduler = scheduler
        self.threshold = 0.65
        self.cache = {
            "train_loss": [],
            "valid_loss": [],
            "train_acc": [],
            "valid_acc": [],
            "lr": [],
        }
    #

    def load_checkpoint(self, path):
        params = torch.load(path)
        self.model.load_state_dict(params["model"])
        self.optimizer.load_state_dict(params["optimizer"])
        if self.scheduler is not None:
            self.scheduler.load_state_dict(params["scheduler"])
        self.cache = params["cache"]
        print("[+] Model load successful")
    #

    def save_checkpoint(self, path):
        params = {
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "scheduler": None if self.scheduler is None else self.scheduler.state_dict(),
            "cache": self.cache
        }
        torch.save(params, path)
        print("[+] Save checkpoint successfully")
    #

    def compute_metrics(self, preds, labels):
        labels = labels.cpu().detach().numpy()
        preds = preds.cpu().detach()
        if preds.ndim == 1:
            preds = preds.unsqueeze(0)
        preds = torch.sigmoid(preds).numpy()
        preds[preds >= self.threshold] = 1.0
        preds[preds < 1.0] = 0
        accuracy = accuracy_score(labels, preds)
        precision = precision_score(labels, preds, average='weighted', zero_division=1)
        recall = recall_score(labels, preds, average='weighted', zero_division=1)
        f1 = f1_score(labels, preds, average='weighted', zero_division=1)
        return {
            'accuracy': round(accuracy, 3),
            'precision': round(precision, 3),
            'recall': round(recall, 3),
            'f1': round(f1, 3),
        }
    #

    def forward(self, dataloader, train_mode="Train"):
        if train_mode == "Train":
            self.model.train()
        else:
            self.model.eval()
        loss_his = []
        acc_his = []
        N = len(dataloader)
        self.cache_valid = {
            "logits": [],
            "labels": []
        }
        for idx, (images, labels) in enumerate(dataloader, 1):
            if train_mode == "Train":
                self.optimizer.zero_grad()
            images = images.to(self.device)
            labels = labels.to(self.device)
            if train_mode == "Train":
                logits = self.model(images)
                loss = self.criterion(logits, labels)
                loss.backward()
                self.optimizer.step()
                if self.scheduler is not None:
                    self.scheduler.step()
            else:
                with torch.no_grad():
                    logits = self.model(images)
                    loss = self.criterion(logits, labels)
            acc = self.compute_metrics(logits, labels)
            loss_his.append(loss.item())
            acc_his.append(acc)

            log_info = {
                train_mode: f" batch: {idx} / {N}",
                "loss": f": {round(loss_his[-1], 5)}",
                "accuracy": f": {acc['accuracy']}",
                "precision": f": {acc['precision']}",
                "recall": f": {acc['recall']}",
                "f1": f": {acc['f1']}"
            }
            log_info = [str(k) + str(v) for k, v in log_info.items()]
            log_info = " - ".join(log_info)
            print("\r", end="")
            print(log_info, end="" if idx != N else "\n")

        loss_his = sum(loss_his) / len(loss_his)
        acc_his = {
            "accuracy": sum([i['accuracy'] for i in acc_his]) / len(acc_his),
            "precision": sum([i['precision'] for i in acc_his]) / len(acc_his),
            "recall": sum([i['recall'] for i in acc_his]) / len(acc_his),
            "f1": sum([i['f1'] for i in acc_his]) / len(acc_his),
        }
        acc_his = {
            k : round(float(v), 3) for k , v in acc_his.items()
        }
        if train_mode == "Train":
            self.cache["train_loss"].append(loss_his)
            self.cache["train_acc"].append(acc_his)
        else:
            self.cache["valid_loss"].append(loss_his)
            self.cache["valid_acc"].append(acc_his)
    #

    def fit(
        self,
        train_loader,
        valid_loader=None,
        epochs=10,
        checkpoint="./checkpoint.pt"
    ):
        print(f"Running on: {torch.cuda.get_device_name(torch.cuda.current_device())}")
        print(f"Total update step: {len(train_loader) * epochs}")

        for epoch in range(1, epochs+1):
            start_time = time.time()
            print(f"Epoch: {epoch}")
            logs = []
            current_lr = f": {self.optimizer.param_groups[0]['lr']:.5}"
            try:
                self.forward(train_loader)
                train_loss = round(self.cache["train_loss"][-1], 5)
                train_acc = [str(k) + ": " + str(v) for k, v in self.cache["train_acc"][-1].items()]
                train_acc = " - ".join(train_acc)
                logs.append(f"\t=> Train epoch: loss: {train_loss} - {train_acc}")
            except KeyboardInterrupt:
                sys.exit()
            if valid_loader is not None:
                try:
                    self.forward(valid_loader, "Valid")
                    valid_loss = round(self.cache["valid_loss"][-1], 5)
                    valid_acc = [str(k) + ": " + str(v) for k, v in self.cache["valid_acc"][-1].items()]
                    valid_acc = " - ".join(valid_acc)
                    logs.append(f"\t=> Valid epoch: loss: {valid_loss} - {valid_acc}")
                except KeyboardInterrupt:
                    sys.exit()
            total_time = round(time.time() - start_time, 1)
            logs.append(f"\t=> Learning Rate: {current_lr} - Time: {timedelta(seconds=int(total_time))}/step\n")
            print("\n".join(logs))
            self.cache["lr"].append(current_lr)
            self.save_checkpoint(checkpoint)

In [14]:
import torchvision
def create_model(input_dim, num_classes=49):
    model = torchvision.models.resnet18(pretrained=True)
    model.conv1 = torch.nn.Conv2d(input_dim, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    model.fc = torch.nn.Linear(in_features=512, out_features=num_classes, bias=True)
    return model

In [15]:
model = create_model(1)
# model



In [17]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-08,
)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=len(train_loader)*2, gamma=0.5)

In [18]:
trainer = Trainer(model, optimizer, criterion, scheduler)
# trainer.load_checkpoint()

In [19]:
trainer.fit(train_loader, valid_loader, 20, "./checkpoint/model_v1.pt")

Running on: Tesla T4
Total update step: 2280
Epoch: 1
Train batch: 114 / 114 - loss: 0.01293 - accuracy: 0.825 - precision: 0.954 - recall: 0.825 - f1: 0.879
Valid batch: 19 / 19 - loss: 0.02516 - accuracy: 0.678 - precision: 0.912 - recall: 0.679 - f1: 0.769
	=> Train epoch: loss: 0.06726 - accuracy: 0.426 - precision: 0.916 - recall: 0.431 - f1: 0.485
	=> Valid epoch: loss: 0.0245 - accuracy: 0.684 - precision: 0.919 - recall: 0.685 - f1: 0.773
	=> Learning Rate: : 0.001 - Time: 0:00:53/step

[+] Save checkpoint successfully
Epoch: 2
Train batch: 114 / 114 - loss: 0.00744 - accuracy: 0.917 - precision: 0.967 - recall: 0.917 - f1: 0.939
Valid batch: 19 / 19 - loss: 0.01226 - accuracy: 0.846 - precision: 0.955 - recall: 0.847 - f1: 0.894
	=> Train epoch: loss: 0.00783 - accuracy: 0.902 - precision: 0.972 - recall: 0.902 - f1: 0.934
	=> Valid epoch: loss: 0.0117 - accuracy: 0.854 - precision: 0.959 - recall: 0.854 - f1: 0.901
	=> Learning Rate: : 0.001 - Time: 0:00:52/step

[+] Save che

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "<ipython-input-7-460b716a5707>", line 157, in fit
    self.forward(train_loader)
  File "<ipython-input-7-460b716a5707>", line 106, in forward
    acc = self.compute_metrics(logits, labels)
  File "<ipython-input-7-460b716a5707>", line 59, in compute_metrics
    labels = labels.cpu().detach().numpy()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-149afa5f960b>", line 1, in <cell line: 1>
    trainer.fit(train_loader, valid_loader, 20, "./checkpoint/model_v1.pt")
  File "<ipython-input-7-460b716a5707>", line 163, in fit
    sys.exit()
SystemExit

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-

TypeError: ignored