In [1]:
import os
import random
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import albumentations as albu
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as BaseDataset
from tqdm.notebook import trange, tqdm
from torch.utils.tensorboard import SummaryWriter
from torchsampler import ImbalancedDatasetSampler
from torchvision import models
import torchvision.transforms as transforms
#from torchsummary import summary
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, roc_curve, average_precision_score
from sklearn.model_selection import GroupShuffleSplit
from pydicom import dcmread
import glob
import pickle
import scipy
from efficientnet_pytorch import EfficientNet
from datetime import datetime
# Seed every random number generator this notebook can draw from so that the
# augmented embedding passes are repeatable after a kernel restart.
seed = 10
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)  # torch was previously left unseeded
# NOTE: changing PYTHONHASHSEED here cannot alter string hashing in the already
# running kernel; it is only inherited by processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)

In [2]:
# Read the per-image table (UID columns forced to pandas' string dtype) and key it
# by SOPInstanceUID in a single expression, so re-running the cell is safe.
dataDF = pd.read_csv(
    'forTrainingDataFold.csv',
    dtype={'StudyInstanceUID': 'string', 'SeriesInstanceUID':'string', 'SOPInstanceUID':'string'},
).set_index('SOPInstanceUID')

In [3]:
# Preview the first rows to confirm the table loaded with the expected columns.
dataDF.head()

Unnamed: 0_level_0,StudyInstanceUID,SeriesInstanceUID,pe_present_on_image,acute,chronic,leftsided_pe,rightsided_pe,central_pe,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,...,qa_contrast,flow_artifact,true_filling_defect_not_pe,negative_exam_for_pe,chronic_pe,acute_and_chronic_pe,indeterminate,img_pos,patient_position,fold
SOPInstanceUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c0f3cb036d06,6897fa9de148,2bfbb7fd2e8b,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,-234.5,HFS,3
f57ffd3883b6,6897fa9de148,2bfbb7fd2e8b,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,-252.5,HFS,3
41220fda34a3,6897fa9de148,2bfbb7fd2e8b,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,-432.5,HFS,3
13b685b4b14f,6897fa9de148,2bfbb7fd2e8b,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,-434.5,HFS,3
be0b7524ffb4,6897fa9de148,2bfbb7fd2e8b,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,-436.5,HFS,3


In [4]:
# Sanity check: list the slices of the first study ordered by descending img_pos.
listOfStudyIDs = dataDF['StudyInstanceUID'].unique()

eachStudyID = listOfStudyIDs[0]
print(eachStudyID)
eachStudyDF = dataDF[dataDF['StudyInstanceUID']==eachStudyID]
sortedStudyDF = eachStudyDF.sort_values(by=['img_pos'], ascending=False)

# Lift the row limit for this one print only; a global pd.set_option would leave
# every later cell free to dump arbitrarily long frames into the notebook.
with pd.option_context('display.max_rows', None):
    print(sortedStudyDF['img_pos'])

# The frame is indexed by SOPInstanceUID, so its index is the ordered list of image IDs.
sortedList = sortedStudyDF.index.to_list()

6897fa9de148
SOPInstanceUID
c0f3cb036d06   -234.5
9e712b4d09a4   -236.5
d62e15417d41   -238.5
2a52ff46886c   -240.5
f8c32ebbf29a   -242.5
8bd638f6b65e   -244.5
5b8e6d8c26e6   -246.5
d9d5081f6ee8   -248.5
41984269c2ff   -250.5
f57ffd3883b6   -252.5
7496a1354775   -254.5
22fb80615fff   -256.5
c1fcb8dfa2d1   -258.5
2641adc29b84   -260.5
965e94785415   -262.5
a787ce664744   -264.5
b9c1845f4b9a   -266.5
ab994ef19adb   -268.5
f4594a785591   -270.5
822dd7790999   -272.5
04cb74d5008c   -274.5
148dfe893a66   -276.5
26fbb2a74642   -278.5
e93fc07d70a3   -280.5
53d9f36c01c3   -282.5
da97c4b2cc9c   -284.5
925fa5a0923e   -286.5
c89fbed94e97   -288.5
3338c96871be   -290.5
e8e41d4ae094   -292.5
27c48c3ac49e   -294.5
b973b5ef07f5   -296.5
a098c6594df8   -298.5
b5e4babf2887   -300.5
1f1a8108891c   -302.5
9b599fd58996   -304.5
ec62413321d3   -306.5
09bfdbc349e5   -308.5
5edaf0e45844   -310.5
d10f967890c7   -312.5
bbac9d6fe07a   -314.5
44135d227780   -316.5
9a83c7356ba8   -318.5
c1a5538bba20   -320.5
fd9d

# Load the model, optionally modifying it to extract features

In [5]:
# Checkpoint of the trained CNN; its file stem (name without extension) is reused
# further down to name the embedding output folders.
modelPath = 'models/CNNmodel/CNNmodel_01_epoch1_CV4_20201008_2252.pth'
modelName = os.path.splitext(os.path.basename(modelPath))[0]
print(modelName)
#model = torch.load(modelPath)
#print(model)

CNNmodel_01_epoch1_CV4_20201008_2252


In [None]:
#model.fc = nn.Sequential(*list(model.fc.children())[:-2])
#print(model)

In [None]:
#embedderModelPath = modelPath[:-4]+'_embedder.pth'
#torch.save(model,embedderModelPath)

In [6]:
# Load the saved "embedder" variant of the checkpoint. The path is derived from
# modelPath (same '<checkpoint>_embedder.pth' rule as the commented-out save cell)
# so the loaded weights cannot drift from modelName, which labels the output folders.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
embedderModelPath = modelPath[:-4]+'_embedder.pth'
model = torch.load(embedderModelPath)

In [7]:
# Build a lookup from image ID (the DICOM file name without its extension) to file path.
listOfDCMfiles = glob.glob('data/train/*/*/*.dcm')

# os.path copes with whichever separator glob returns on this platform, and
# splitext removes the extension without assuming it is exactly four characters.
imageID2pathDict = {
    os.path.splitext(os.path.basename(eachPath))[0]: eachPath
    for eachPath in listOfDCMfiles
}

In [8]:
# Report how many DICOM files were indexed.
listOfImageID = list(imageID2pathDict)
print(len(listOfImageID))

1790594


In [9]:
# helper functions

def window(img, WL=50, WW=350):
    """Clip an image to an intensity window and rescale the result to uint8.

    Args:
        img: array-like of intensities.
        WL: window level (centre of the window).
        WW: window width; the window spans WL - WW//2 .. WL + WW//2.

    Returns:
        A uint8 array of the same shape. After clipping, the image is shifted so
        its minimum is 0 and divided by its maximum, then scaled to 0..255. A
        clipped image with a single constant value maps to all zeros.
    """
    upper, lower = WL+WW//2, WL-WW//2
    X = np.clip(np.asarray(img), lower, upper)  # np.clip already returns a new array
    if X.size == 0:
        # np.min/np.max raise on empty input; there is nothing to rescale.
        return X.astype('uint8')
    X = X - np.min(X)
    peak = np.max(X)
    if peak > 0:
        X = X / peak
    # peak == 0 means every pixel clipped to the same value; dividing would give
    # 0/0 = NaN and an undefined uint8 cast, so the zeros are kept as they are.
    X = (X*255.0).astype('uint8')
    return X

# Tensor conversion followed by per-channel normalisation (the mean/std values are
# the ones commonly used with ImageNet-pretrained models).
# NOTE(review): the callers in this notebook cast images to float32 before applying
# this transform, and torchvision's ToTensor only rescales uint8 input to [0, 1] -
# so Normalize presumably receives 0-255 values here. Confirm this matches the
# preprocessing the model was trained with before changing anything.
data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

def get_augmentation(numImages=0):
    """Build the albumentations pipeline used to augment images before embedding.

    The pipeline always applies a small shift/scale/rotate, then with low
    probability adds Gaussian noise and a perspective warp, and finally applies
    at most one transform from each of three groups (brightness/gamma/CLAHE,
    sharpen/blur, contrast/hue-saturation), each group firing with p=0.3.

    Args:
        numImages: accepted for interface compatibility; not used by the body.

    Returns:
        An ``albu.Compose`` wrapping the transforms in the order listed above.

    NOTE(review): the IAA* transforms are reportedly unavailable in newer
    albumentations releases - confirm the pinned version before upgrading.
    """
    geometric = albu.ShiftScaleRotate(scale_limit=0.05, rotate_limit=30, shift_limit=0.05, p=1, border_mode=0)
    gaussianNoise = albu.IAAAdditiveGaussianNoise(p=0.1)
    perspective = albu.IAAPerspective(p=0.2)

    # One of: local contrast equalisation, brightness shift, gamma shift.
    intensityGroup = albu.OneOf(
        [albu.CLAHE(p=1), albu.RandomBrightness(p=1), albu.RandomGamma(p=1)],
        p=0.3,
    )
    # One of: sharpen, box blur, motion blur.
    sharpnessGroup = albu.OneOf(
        [albu.IAASharpen(p=1), albu.Blur(blur_limit=3, p=1), albu.MotionBlur(blur_limit=3, p=1)],
        p=0.3,
    )
    # One of: contrast change, hue/saturation/value change.
    colourGroup = albu.OneOf(
        [albu.RandomContrast(p=1), albu.HueSaturationValue(p=1)],
        p=0.3,
    )

    pipeline = [geometric, gaussianNoise, perspective, intensityGroup, sharpnessGroup, colourGroup]
    return albu.Compose(pipeline)

def augmentAll(myImages):
    """Augment a sequence of images in a single albumentations call.

    Every image is registered as an additional target of type 'image'
    ('image0', 'image1', ...) and the first image is also passed as the
    mandatory 'image' target, so the whole stack goes through one invocation of
    the pipeline from ``get_augmentation``.
    NOTE(review): this presumably makes all slices share the same sampled
    transform parameters - confirm against the albumentations version in use.

    Args:
        myImages: sequence of images belonging to one study.

    Returns:
        A list with the augmented images in the same order as ``myImages``;
        an empty list when ``myImages`` is empty.
    """
    # An empty study has no first image to use as the mandatory 'image' target;
    # return early instead of failing with an IndexError on myImages[0].
    if len(myImages) == 0:
        return []

    imageKeys = ['image{}'.format(index) for index in range(len(myImages))]
    declareTargets = {eachKey: 'image' for eachKey in imageKeys}
    addnTargets = dict(zip(imageKeys, myImages))
    addnTargets['image'] = myImages[0]
    myAug = albu.Compose(transforms=get_augmentation(), additional_targets=declareTargets)
    augmented = myAug(**addnTargets)
    return [augmented[eachKey] for eachKey in imageKeys]

# Helper functions for inference
def dcmToImage(imageID):
    """Read one DICOM slice and return it as a three-channel uint8 image.

    The stored pixel values are rescaled with the file's RescaleSlope and
    RescaleIntercept, and the result is rendered through three CT window
    settings that are stacked along the last axis.

    Args:
        imageID: key into ``imageID2pathDict`` identifying the DICOM file.

    Returns:
        A uint8 array with the three windowed views stacked on axis 2.
    """
    dcm_data = dcmread(imageID2pathDict[imageID])
    # Use float, not int: int() would truncate a fractional slope/intercept
    # (a slope of 0.5 would become 0 and blank the image). Converting the pixels
    # to float64 first also keeps unsigned stored values from wrapping when the
    # intercept is negative. For integer slope/intercept the windowed output is
    # unchanged.
    slope = float(dcm_data.RescaleSlope)
    intercept = float(dcm_data.RescaleIntercept)
    image = dcm_data.pixel_array.astype(np.float64) * slope + intercept
    image = np.stack([window(image, WL=-600, WW=1500),
                    window(image, WL=40, WW=400),
                    window(image, WL=100, WW=700)], 2)
    #image = image.astype(np.float32)
    return image

def chunks(lst, n):
    """Lazily split lst into consecutive slices holding at most n items each."""
    total = len(lst)
    yield from (lst[start:start + n] for start in range(0, total, n))

In [10]:
# Move the embedder to the GPU and switch it to evaluation mode in one chained call.
model = model.cuda().eval()

In [11]:
def createEmbeddings(embeddingDirPath=None, augment=False):
    """Embed every study with ``model`` and pickle one file per study.

    For each study ID in ``listOfStudyIDs`` the slices are ordered by descending
    ``img_pos``, converted with ``dcmToImage``, optionally augmented, pushed
    through ``model`` in batches of 128 and saved as
    ``<directory>/<StudyInstanceUID>.p`` holding a dict with keys ``'ids'``
    (ordered image IDs) and ``'embeddings'`` (NumPy array, one row per image).

    Args:
        embeddingDirPath: directory to write into; created if missing. When
            None, the notebook-level ``embeddingSaveDir`` is used, which is what
            earlier versions of this function always wrote to.
        augment: when truthy, the slices of a study are augmented with
            ``augmentAll`` before embedding.
    """
    # The parameter used to be ignored in favour of the global; honour it now and
    # keep the global only as a fallback for callers that pass nothing.
    saveDir = embeddingDirPath if embeddingDirPath is not None else embeddingSaveDir
    os.makedirs(saveDir, exist_ok=True)

    for eachStudyID in tqdm(listOfStudyIDs):
        eachStudyDF = dataDF[dataDF['StudyInstanceUID']==eachStudyID]
        sortedStudyDF = eachStudyDF.sort_values(by=['img_pos'], ascending=False)
        sortedList = sortedStudyDF.index.to_list()

        sortedImages = [dcmToImage(eachImageID) for eachImageID in sortedList]
        if augment:
            sortedImages = augmentAll(sortedImages)
        # Cast to float32 before the tensor transform, as the original pipeline did.
        listOfTensors = [data_transform(eachImage.astype(np.float32)) for eachImage in sortedImages]

        with torch.no_grad():
            embeddingList = []
            for eachChunk in chunks(listOfTensors, 128):
                stackedChunk = torch.stack(eachChunk, dim=0).cuda()
                embeddingList.append(model(stackedChunk))
            embeddingVol = torch.cat(embeddingList, dim=0).cpu().numpy()

        dictToSave = {'ids':sortedList, 'embeddings':embeddingVol}
        # Context manager so each file is flushed and closed before the next study.
        with open(os.path.join(saveDir, eachStudyID+'.p'), 'wb') as embeddingFile:
            pickle.dump(dictToSave, embeddingFile)

In [12]:
# Produce five augmented embedding passes, written to .../aug00 through .../aug04.
# The directory is kept in the notebook-level name embeddingSaveDir because
# createEmbeddings may read that global.
for eachIndex in range(5):
    embeddingSaveDir = 'data/embeddings/' + modelName + '/aug' + '{:0>2}'.format(eachIndex)
    if not os.path.exists(embeddingSaveDir):
        os.makedirs(embeddingSaveDir)
    createEmbeddings(embeddingDirPath=embeddingSaveDir, augment=True)

HBox(children=(FloatProgress(value=0.0, max=7279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7279.0), HTML(value='')))




# Scratch code

In [None]:
# Sanity Check

# Load one saved study back; the context manager closes the file handle that
# pickle.load(open(...)) used to leave open.
# NOTE(review): pickle.load can execute code stored in the file - only use it on
# files this notebook wrote itself.
with open('data/embeddings/CNNmodel_01_epoch1_CV4_20201008_2252/val/6897fa9de148.p','rb') as embeddingFile:
    myEmbedding = pickle.load(embeddingFile)

In [None]:
# Inspect the shape of the stored embeddings for the study loaded above.
myEmbedding['embeddings'].shape

In [None]:
# Un-augmented, one-image-at-a-time variant of the embedding export.
embeddingSaveDir = 'data/embeddings/CNNmodel_01_epoch1_20201005_1533/val/'
os.makedirs(embeddingSaveDir, exist_ok=True)

for eachStudyID in tqdm(listOfStudyIDs):
    eachStudyDF = dataDF[dataDF['StudyInstanceUID']==eachStudyID]
    sortedStudyDF = eachStudyDF.sort_values(by=['img_pos'], ascending=False)
    sortedList = sortedStudyDF.index.to_list()

    with torch.no_grad():
        embeddingVolume = []
        for eachImageID in sortedList:
            # dcmToImage performs the same read + three-window stacking that was
            # previously duplicated inline here.
            image = dcmToImage(eachImageID).astype(np.float32)
            toPred = data_transform(image).cuda().unsqueeze(0)
            embedding = model(toPred)
            embeddingVolume.append(embedding.detach().cpu().numpy()[0])

    # Write once per study, after all slices are embedded. The dump used to sit
    # inside the per-image loop, rewriting the file for every slice with a
    # partial 'embeddings' list next to the full 'ids' list.
    # 'embeddings' stays a list of per-image arrays, as in the original output.
    dictToSave = {'ids':sortedList, 'embeddings':embeddingVolume}
    with open(embeddingSaveDir+eachStudyID+'.p','wb') as embeddingFile:
        pickle.dump(dictToSave, embeddingFile)

In [None]:
# Sanity Check
# embeddingDict = pickle.load(open(embeddingSaveDir+eachStudyID+'.p','rb'))
# print(embeddingDict)