<a href="https://colab.research.google.com/github/vikranthkeerthipati/IEEECovidCTHackathon/blob/main/CovidCTHackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount the user's Google Drive at /content/drive (Colab only); the later
# cells check for the dataset under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages imported by this notebook.
!pip install nibabel pandas torch opencv-python tqdm
import nibabel as nib
import matplotlib.pyplot as plt
import glob
import os
import tqdm
# List the dataset folder on the mounted Drive to confirm it is reachable.
!ls /content/drive/MyDrive/COVID19_1110

dataset_registry.xlsx  masks	     README_EN.pdf  README_RU.pdf
LICENSE		       README_EN.md  README_RU.md   studies


In [None]:
# Downloading dataset and unzipping file
import os

root_dir = ""

if os.path.exists("/content/drive/MyDrive/COVID19_RGB"):
  root_dir = "/content/drive/MyDrive/COVID19_RGB"

elif os.path.exists("/content/drive/MyDrive/COVID19_1110"):
  root_dir = "/content/drive/MyDrive/COVID19_1110"

elif not os.path.exists("./COVID19_1110"):
  !wget https://storage.yandexcloud.net/covid19.1110/prod/COVID19_1110.zip
  !unzip COVID19_1110.zip
  root_dir = "./COVID19_1110"

print("Root Directory:",root_dir)


Root Directory: /content/drive/MyDrive/COVID19_RGB


In [None]:
# Loading images metadata
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models
import numpy as np
from PIL import Image
import cv2 
from nibabel.viewers import OrthoSlicer3D
from matplotlib import cm
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import transforms
import time
import copy
from tqdm import tqdm, tqdm_notebook

def create_processed_arr(img,root_dir=None):
  """
  Load an image and turn it into a preprocessed tensor.

  Args:
      img: Path string or file-like buffer accepted by ``Image.open``. When
          ``root_dir`` is given, ``img`` is concatenated onto it with ``+``
          (no separator is inserted), so it must then be a string.
      root_dir (string, optional): Prefix prepended to ``img``.

  Returns:
      The output of ``Resize(224)`` followed by ``ToTensor()`` applied to the
      image after conversion to RGB.
  """
  # TODO: Data Augmentation
  # https://www.researchgate.net/post/Data-augmentation-techniques-for-medical-image
  source = root_dir+img if root_dir else img
  rgb_image = Image.open(source).convert("RGB")

  pipeline = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
  ])
  return pipeline(rgb_image)

# TODO: Change the grayscale conversion so it does not rely on matplotlib (@Yujin)
def convert_to_rgb(x,root_dir):
  """
  Convert a .nii.gz volume into one preprocessed 2D slice in RGB format.

  Args:
      x (string): Path of the volume relative to ``root_dir``; it is joined
          as ``root_dir + "/" + x``.
      root_dir (string): Directory containing the volume files.

  Returns:
      The result of ``create_processed_arr`` applied to the middle slice
      (along the last axis) rendered as a grayscale image.
  """
  import io

  volume = nib.load(root_dir + "/%s" % x).get_fdata()
  # Pick the central slice along the last axis; indexing assumes a 3D volume.
  middle_slice = volume[:,:,volume.shape[-1] // 2]

  # plt.imsave writes the array straight to the buffer and does not need a
  # Figure, so no figure is created here. The context manager closes the
  # buffer even if rendering or preprocessing raises.
  with io.BytesIO() as buf:
    plt.imsave(buf, middle_slice, cmap="gray")
    buf.seek(0)
    return create_processed_arr(buf)


class CovidCTDataset(Dataset):
  """
  Dataset of preprocessed CT images with a binary label.

  Every image is loaded and preprocessed eagerly in ``__init__`` and kept in
  the ``img_arr`` column of ``ct_df``; ``__getitem__`` only looks rows up.

    Args:
        excel_file (string): Path to the excel file with annotations.
        root_dir (string): Directory with all the images.
        transform (callable, optional): Optional transform to be applied
            on a sample's image. Non-callable values are ignored.
  """
  def __init__(self, excel_file, root_dir, transform=None):
    # Register DataFrame.progress_apply on the notebook progress bar class.
    # Calling .pandas() on the class (not on a tqdm_notebook() instance)
    # avoids the deprecation warning and the stray empty progress bar.
    from tqdm.notebook import tqdm as notebook_tqdm
    notebook_tqdm.pandas()

    self.ct_df = pd.read_excel(excel_file)
    # Integer encode CT-0 to 0 and every other category to 1
    self.ct_df["identification"] = pd.Series(
      np.where(self.ct_df.category.values == 'CT-0', 0, 1),
      self.ct_df.index)
    # Shuffle the rows once up front.
    self.ct_df = self.ct_df.sample(frac=1)

    # The COVID19_RGB folder is read directly as image files; any other root
    # is treated as .nii.gz volumes that must be rendered to a slice first.
    if "COVID19_RGB" in root_dir:
      loader = create_processed_arr
    else:
      loader = convert_to_rgb
    self.ct_df["img_arr"] = self.ct_df.study_file.progress_apply(loader,args=(root_dir,))

    self.root_dir = root_dir
    self.transform = transform

  def __len__(self):
    """Return the number of annotated studies."""
    return len(self.ct_df)

  def __getitem__(self, idx):
    """Return ``(image, label_tensor)`` for the row at position ``idx``."""
    ct_item = self.ct_df.iloc[idx]
    image = ct_item["img_arr"]
    # Apply the optional transform; the callable() guard keeps legacy
    # callers that pass a flag such as True working unchanged.
    if callable(self.transform):
      image = self.transform(image)
    return (image, torch.tensor(ct_item["identification"]))

# Build the dataset. `transform` is documented as an optional callable, so
# pass None rather than a boolean flag.
ct_dataset = CovidCTDataset(root_dir+"/dataset_registry.xlsx",root_dir,transform=None)

# 70% of the samples go to training; the remainder is held out.
train_set_size = int(len(ct_dataset) * 0.7)

# TODO: Train, Val, Test split
train_set, test_set = torch.utils.data.random_split(ct_dataset, [train_set_size, len(ct_dataset) - train_set_size])

train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True, num_workers=0)

# Evaluation does not need a shuffled order; a fixed order keeps per-sample
# results comparable between runs.
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False, num_workers=0)

# Class names indexed by the integer label (0 = CT-0, 1 = everything else).
classes = ("Non-COVID", "COVID")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

resnet_model = models.resnet50(pretrained=True)

# TODO: Evaluate the model on the test set after each epoch
# From PyTorch Tutorial https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html with modifications
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """
    Train ``model`` on the module-level ``train_dataloader``.

    Args:
        model: Network to train; it must already be on its target device.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per epoch.
        num_epochs (int): Number of passes over the training data.

    Returns:
        ``model`` with the weights of the epoch that had the highest
        training accuracy loaded (the initial weights if none improved on 0).
    """
    # Function-scope import: tqdm.tqdm_notebook is deprecated in favour of
    # tqdm.notebook.tqdm.
    from tqdm.notebook import tqdm as notebook_tqdm

    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Run batches on whatever device the model lives on instead of relying on
    # a global; fall back to CPU for a model without parameters.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Normalise by the dataset that is actually iterated; guard against an
    # empty dataset so the averages below cannot divide by zero.
    num_samples = max(len(train_dataloader.dataset), 1)

    for epoch in notebook_tqdm(range(0,num_epochs)):

        # Setting model to training mode
        model.train()

        running_loss = 0.0
        running_corrects = 0

        # Iterate over training data.
        for batch_inputs, batch_labels in train_dataloader:
            inputs_dev = batch_inputs.to(model_device)
            labels_dev = batch_labels.to(model_device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize (gradients explicitly enabled)
            with torch.set_grad_enabled(True):
                outputs = model(inputs_dev)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels_dev)
                loss.backward()
                optimizer.step()

            # statistics: loss.item() is the batch mean, so weight it by the
            # batch size; keep plain Python numbers so an empty loader works.
            running_loss += loss.item() * inputs_dev.size(0)
            running_corrects += torch.sum(preds == labels_dev).item()

        # Step the scheduler once per epoch. Stepping it per batch made an
        # epoch-based schedule (e.g. StepLR with step_size=7) decay the
        # learning rate every 7 batches instead of every 7 epochs.
        scheduler.step()

        epoch_loss = running_loss / num_samples
        epoch_acc = running_corrects / num_samples

        print('Loss: {:.4f} Acc: {:.4f}'.format(
            epoch_loss, epoch_acc))

        # deep copy the model
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # No validation set is evaluated here, so report this as training accuracy.
    print('Best train Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Freeze weights on all layers except the FC layer
for param in resnet_model.parameters():
  param.requires_grad = False

# Obtain the number of features the pretrained FC layer has as input
num_ftrs = resnet_model.fc.in_features

# Replace the head with a fresh 2-class layer; being newly created, its
# parameters are trainable even though the backbone was frozen above.
resnet_model.fc = nn.Linear(num_ftrs, 2)

# Set the loss function to be Cross Entropy
criterion = nn.CrossEntropyLoss()

# Only the new FC head is trainable, so hand just those parameters to the
# optimizer instead of the whole (frozen) network.
# optimizer_ft = optim.SGD(resnet_model.fc.parameters(), lr=0.001, momentum=0.9)

optimizer_ft = optim.Adam(resnet_model.fc.parameters(), lr=0.001)

# Decay LR by a factor of 0.1 every 7 scheduler steps
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

# Move model onto device
resnet_model = resnet_model.to(device)

# Train the model via the train_model function
model_ft = train_model(resnet_model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)




# TODO: Postprocess to create ROC curve @Yuyan
# https://torchmetrics.readthedocs.io/en/latest/references/modules.html#auroc






Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

  0%|          | 0/1110 [00:00<?, ?it/s]