# Import libraries

In [1]:
# numpy, scipy, pandas, sklearn, matplotlib
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score,RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# pytorch and pytorch lightning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision 
from torchvision import datasets
import torchvision.transforms as transforms
!pip install torchsummary
from torchsummary import summary
try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip install --quiet pytorch-lightning>=1.4
    import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# others
import os
from tqdm import tqdm_notebook as tqdm
import time
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
[0m

In [2]:
# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)
# print("Number of workers:", NUM_WORKERS)

Device: cuda:0


In [3]:
# global constants
IMAGE_SIZE = (224, 224)
NUM_CLASSES = 3
BATCH_SIZE = 32

# Import data

In [4]:
# dataset path
train_path = '../input/covid19-image-dataset/Covid19-dataset/train'
val_path = '../input/covid19-image-dataset/Covid19-dataset/test'
deploy_path = '../input/covid19radiographydatabaseedited/COVID-19_Radiography_Dataset'

# import data
detect_transform = transforms.Compose([transforms.Resize(IMAGE_SIZE), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])])
trainset = datasets.ImageFolder(train_path,transform = detect_transform)
valset = datasets.ImageFolder(val_path,transform = detect_transform)
deployset = datasets.ImageFolder(deploy_path,transform = detect_transform)

# reduce size of deployment data (for computational time and memory)
n_deployment = 800
deploy_idx = np.random.choice(len(deployset.targets), size=n_deployment, replace=False)
deployset = Subset(deployset, deploy_idx)

print("Training set size: {}\nValidation set size: {}\nDeployment set size: {}".format(len(trainset.targets),len(valset.targets),n_deployment))
print(trainset.class_to_idx)

Training set size: 251
Validation set size: 66
Deployment set size: 800
{'Covid': 0, 'Normal': 1, 'Viral Pneumonia': 2}


# Detection for model adjustment: Model-based

In [5]:
class SimCLR(pl.LightningModule):

    def __init__(self, hidden_dim, lr, temperature, weight_decay, pretrained_model=None, max_epochs=500):
        super().__init__()
        self.save_hyperparameters()
        assert self.hparams.temperature > 0.0, 'The temperature must be a positive float!'
        # Base model f(.)
        self.model = pretrained_model if pretrained_model is not None else torchvision.models.resnet50(pretrained=True)  # Output of last linear layer: 2048-dim representation
        # print("Hi\n{}".format(list(self.model.children())))
        # The MLP for g(.) consists of Linear->ReLU->Linear
        # print(self.model.fc)
        # print(len(list(list(self.model.children())[-1].children())))
        # print(list(self.model.children())[-1])
        # print(list(list(self.model.children())[-1].children()))
        if len(list(list(self.model.children())[-1].children())) == 0:
            self.model = nn.Sequential(*(list(self.model.children())[:-1]),
                                        nn.Flatten(),
                                        nn.Linear(self.model.fc.in_features, 2048),
                                        nn.ReLU(inplace=True),
                                        nn.Linear(2048, hidden_dim))
            print("done")
        # print("Hihi\n{}".format(list(self.model.children())))
    
    def forward(self, x):
        return self.model(x)
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(),
                                lr=self.hparams.lr,
                                weight_decay=self.hparams.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                            T_max=self.hparams.max_epochs,
                                                            eta_min=self.hparams.lr/50) # search what does it refer to
        return [optimizer], [lr_scheduler]

    def info_nce_loss(self, batch, mode='train'):
        imgs, _ = batch
        # print(imgs.shape)
        imgs = torch.cat(imgs, dim=0)
        # print(imgs.shape)

        # Encode all images
        feats = self(imgs)
        # print(feats.shape)
        # Calculate cosine similarity
        cos_sim = F.cosine_similarity(feats[:,None,:], feats[None,:,:], dim=-1)
        # Mask out cosine similarity to itself
        self_mask = torch.eye(cos_sim.shape[0], dtype=torch.bool, device=cos_sim.device)
        cos_sim.masked_fill_(self_mask, -9e15)
        # Find positive example -> batch_size//2 away from the original example
        pos_mask = self_mask.roll(shifts=cos_sim.shape[0]//2, dims=0)
        # InfoNCE loss
        cos_sim = cos_sim / self.hparams.temperature
        nll = -cos_sim[pos_mask] + torch.logsumexp(cos_sim, dim=-1)
        nll = nll.mean()

        # Logging loss
        print(mode+'_loss: {:.4f}'.format(nll))
        self.log(mode+'_loss', nll)
        # Get ranking position of positive example
        comb_sim = torch.cat([cos_sim[pos_mask][:,None],  # First position positive example
                              cos_sim.masked_fill(pos_mask, -9e15)],
                             dim=-1)
        sim_argsort = comb_sim.argsort(dim=-1, descending=True).argmin(dim=-1)
        # Logging ranking metrics
        self.log(mode+'_acc_top1', (sim_argsort == 0).float().mean())
        self.log(mode+'_acc_top5', (sim_argsort < 5).float().mean())
        self.log(mode+'_acc_mean_pos', 1+sim_argsort.float().mean())

        return nll

    def training_step(self, batch, batch_idx):
        return self.info_nce_loss(batch, mode='train')

    def validation_step(self, batch, batch_idx):
        self.info_nce_loss(batch, mode='val')

In [6]:
simclr_model = SimCLR.load_from_checkpoint("../input/ssrchestxray/ssr-chest-x-ray.ckpt") # load pre-trained model
feature_model = torch.nn.Sequential(*(list(simclr_model.model.children())[:-3]))
# feature_model = simclr_model

FEATURE_SIZE = 2048

done


In [7]:
train_loader = DataLoader(trainset,batch_size=BATCH_SIZE,shuffle=False,num_workers=4)
train_values = np.zeros((2*BATCH_SIZE, FEATURE_SIZE))

for i, (images, labels) in enumerate(tqdm(train_loader, total=int(len(train_loader)))):
    train_values[BATCH_SIZE*i:BATCH_SIZE*i+images.shape[0],:] = feature_model(images).detach().cpu().numpy()
    if i == 1:
        break

print(train_values.shape)

  0%|          | 0/8 [00:00<?, ?it/s]

(64, 2048)


In [8]:
val_loader = DataLoader(valset,batch_size=BATCH_SIZE,shuffle=False,num_workers=4)
val_values = np.zeros((len(valset.targets), FEATURE_SIZE))

for i, (images, labels) in enumerate(tqdm(val_loader, total=int(len(val_loader)))):
    val_values[BATCH_SIZE*i:BATCH_SIZE*i+images.shape[0],:] = feature_model(images).detach().cpu().numpy()

print(val_values.shape)

  0%|          | 0/3 [00:00<?, ?it/s]

(66, 2048)


In [9]:
X_control = np.vstack((train_values, val_values))
y_control = np.hstack((np.ones(train_values.shape[0]), np.zeros(val_values.shape[0])))
# print(X_control.shape, y_control.shape) # Check if they have the correct shape
X_control_train, X_control_test, y_control_train, y_control_test = train_test_split(X_control, y_control, test_size=0.2, random_state=42)
control_model = RandomForestClassifier(random_state=42)
control_model.fit(X_control_train, y_control_train)
print("Classification score for drift control: {}".format(control_model.score(X_control_test, y_control_test)))

Classification score for drift control: 0.6538461538461539


In [10]:
deploy_loader = DataLoader(deployset,batch_size=BATCH_SIZE,shuffle=False,num_workers=4)
deploy_values = np.zeros((2*BATCH_SIZE, FEATURE_SIZE))

for i, (images, labels) in enumerate(tqdm(deploy_loader, total=int(len(deploy_loader)))):
    deploy_values[BATCH_SIZE*i:BATCH_SIZE*i+images.shape[0],:] = feature_model(images).detach().cpu().numpy()
    if i == 1:
        break

print(deploy_values.shape)

  0%|          | 0/25 [00:00<?, ?it/s]

(64, 2048)


In [11]:
X_detect = np.vstack((train_values, deploy_values))
y_detect = np.hstack((np.ones(train_values.shape[0]), np.zeros(deploy_values.shape[0])))
# print(X_detect.shape, y_detect.shape) # Check if they have the correct shape
X_detect_train, X_detect_test, y_detect_train, y_detect_test = train_test_split(X_detect, y_detect, test_size=0.2, random_state=42)
detection_model = RandomForestClassifier(random_state=42)
detection_model.fit(X_detect_train, y_detect_train)
print("Classification score for drift detection: {}".format(detection_model.score(X_detect_test, y_detect_test)))

X_check = np.vstack((val_values, deploy_values))
y_check = np.hstack((np.ones(val_values.shape[0]), np.zeros(deploy_values.shape[0])))
# print(X_check.shape, y_check.shape) # Check if they have the correct shape
X_check_train, X_check_test, y_check_train, y_check_test = train_test_split(X_check, y_check, test_size=0.2, random_state=42)
check_model = RandomForestClassifier(random_state=42)
check_model.fit(X_check_train, y_check_train)
print("Classification score for drift check: {}".format(check_model.score(X_check_test, y_check_test)))

Classification score for drift detection: 1.0
Classification score for drift check: 0.9230769230769231
