In [1]:
class CFG:
    # Notebook-level configuration constants.
    # Only `batch_size` is referenced in the visible cells (by test());
    # the remaining fields are not read anywhere in the visible code.
    debug=False
    image_size=360
    batch_size=1  # DataLoader batch size used for inference
    seed=2020
    N = 36

# DataLoader worker count used by test(); 0 keeps loading in the main process.
num_workers=0
# Quantiles predicted by the RNN head; they become the column names of the prediction frame.
quantiles = (0.2, 0.5, 0.8)
# Target number of slices along the first axis of each CT volume fed to the autoencoder.
HM_SLICES = 40

In [2]:
import os
import numpy as np 
import pandas as pd 
from pathlib import Path

In [3]:
# --- Input / output locations -------------------------------------------
test_dir = '../input/osic-pulmonary-fibrosis-progression/test/'
outdir = Path('.')

# Scratch directories for preprocessed volumes, latent features and masks.
dicom_arrays_dir = Path('/kaggle/dicom_arrays/')
latent_dir = Path('/kaggle/features_dir/')
mask_dir = Path('/kaggle/masks/')
for scratch_dir in (dicom_arrays_dir, latent_dir, mask_dir):
    os.makedirs(scratch_dir, exist_ok=True)

# Per-patient image statistics cached in the working directory.
cache_dir = None
volume_array_file = Path('volume_array.pt')
kurts_array_file = Path('kurts_array.pt')
skews_array_file = Path('skews_array.pt')
means_array_file = Path('mean_array.pt')
stds_array_file = Path('std_array.pt')
medians_array_file = Path('median_array.pt')

# --- Model checkpoints ----------------------------------------------------
model_dir = '../input/ensemble-models/ensemble_models/ensemble_models/0'
autoencoder_dir = '../input/best-autoencoder-models'


def _list_checkpoints(directory):
    """Print and return the paths of every ``.pt`` file directly inside ``directory``."""
    found = []
    for filename in os.listdir(directory):
        if not filename.endswith(".pt"):
            continue
        checkpoint = os.path.join(directory, filename)
        print(checkpoint)
        found.append(checkpoint)
    return found


MODELS = _list_checkpoints(model_dir)
MODELS1 = _list_checkpoints(autoencoder_dir)

../input/ensemble-models/ensemble_models/ensemble_models/0/model_fold_3.pt
../input/ensemble-models/ensemble_models/ensemble_models/0/model_fold_0.pt
../input/ensemble-models/ensemble_models/ensemble_models/0/model_fold_1.pt
../input/ensemble-models/ensemble_models/ensemble_models/0/model_fold_2.pt
../input/ensemble-models/ensemble_models/ensemble_models/0/model_fold_4.pt
../input/best-autoencoder-models/model_fold_0.pt
../input/best-autoencoder-models/model_fold_1.pt
../input/best-autoencoder-models/model_fold_2.pt


In [4]:
# One sub-folder per test patient.
patient_files = [entry for entry in os.listdir(test_dir)]
print("Number of folders:", len(patient_files))

Number of folders: 5


In [5]:
# Peek at the first patient folder name.
patient_files[0]

'ID00426637202313170790466'

In [6]:
# ====================================================
# Library
# ====================================================

import sys

import gc
import os
import random
import time
import math
import pydicom
from time import perf_counter
from contextlib import contextmanager
from pathlib import Path
from collections import defaultdict, Counter
from IPython.core.display import display, HTML
from multiprocessing import Pool

import cv2
from PIL import Image, ImageOps, ImageEnhance
import numpy as np
import pandas as pd
import scipy
import scipy as sp
import matplotlib.pyplot as plt

import skimage.io
from skimage import measure, feature, morphology
from skimage.util import montage
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import binary_dilation, binary_opening
from skimage.filters import roberts, sobel
from skimage.segmentation import clear_border
from skimage import data
from scipy import ndimage as ndi
from scipy.stats import skew, kurtosis

import sklearn.metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold, train_test_split

from functools import partial
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import init, Sequential
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler, RandomSampler, SequentialSampler
from torch.autograd import Variable

import warnings 
warnings.filterwarnings('ignore')

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
device

device(type='cpu')

In [7]:
# ====================================================
# Utils
# ====================================================

@contextmanager
def timer(name):
    """Context manager that logs the start of a block and its wall-clock duration.

    Messages go to the module-level ``LOGGER``. The "done" message is only
    emitted when the wrapped block finishes without raising.
    """
    started = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s.')

    
def init_logger(log_file='test.log'):
    """Create (or reconfigure) the ``'fibrosis'`` logger.

    The logger writes DEBUG-and-above records both to a stream handler and to
    ``log_file``. ``logging.getLogger`` returns the same object for the same
    name, so handlers left over from a previous call are removed and closed
    first. Without that, re-running the cell duplicates every message and
    leaks an open file handle per call.

    Args:
        log_file: path of the log file to write to.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    from logging import getLogger, DEBUG, FileHandler,  Formatter,  StreamHandler

    log_format = '%(asctime)s %(levelname)s %(message)s'

    logger = getLogger('fibrosis')
    logger.setLevel(DEBUG)

    # Drop handlers installed by an earlier call so output is never duplicated.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    stream_handler = StreamHandler()
    stream_handler.setLevel(DEBUG)
    stream_handler.setFormatter(Formatter(log_format))

    file_handler = FileHandler(log_file)
    file_handler.setFormatter(Formatter(log_format))

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    return logger

# Module-level logger used by timer(); records are also written to LOG_FILE.
LOG_FILE = 'train.log'
LOGGER = init_logger(LOG_FILE)


# def seed_torch(seed=2020):
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True

# seed_torch(seed=2020)

In [8]:
# Load the competition tables; keep a single measurement per (Patient, Weeks).
train_csv = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
train_csv = train_csv.drop_duplicates(subset=['Patient', 'Weeks'])
test_csv = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
sub_csv = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

# 'Patient_Week' is '<patient id>_<week>'; split it back into its two parts.
sub_csv['Weeks']   = sub_csv['Patient_Week'].apply( lambda x: int(x.split('_')[-1]) )
sub_csv['Patient'] = sub_csv['Patient_Week'].apply( lambda x: x.split('_')[0] )
sub_csv =  sub_csv[['Patient','Weeks','Confidence','Patient_Week']]
# Attach each test patient's baseline columns to every requested week.
sub_csv = sub_csv.merge(test_csv.drop('Weeks', axis=1), on="Patient")

train_csv['WHERE'] = 'train'
test_csv['WHERE'] = 'val'
sub_csv['WHERE'] = 'test'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and, like append, keeps each frame's original index.
data = pd.concat([train_csv, sub_csv, test_csv])

In [9]:
# Column names of the combined frame.
columns = list(data.keys())
print(columns)

# Shapes and distinct-patient counts of every frame, in a fixed order.
frames = (train_csv, test_csv, sub_csv, data)
print(*(frame.shape for frame in frames))
print(*(frame.Patient.nunique() for frame in frames))

['Patient', 'Weeks', 'FVC', 'Percent', 'Age', 'Sex', 'SmokingStatus', 'WHERE', 'Confidence', 'Patient_Week']
(1542, 8) (5, 8) (730, 10) (2277, 10)
176 5 5 176


In [10]:
# Preview the combined train / submission / test frame.
data.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,Patient_Week
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,


In [11]:
# Earliest observed week per patient. Rows tagged 'test' (the submission grid)
# are blanked out first so they do not take part in the minimum.
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

# FVC measured at that earliest week; the running count keeps only the first
# matching row per patient so the merge below cannot duplicate rows.
base = data.loc[data.Weeks == data.min_week]
base = base[['Patient','FVC']].copy()
base.columns = ['Patient','min_FVC']
base['nb'] = 1
base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
base = base[base.nb==1]
base.drop('nb', axis=1, inplace=True)

# Attach the baseline FVC to every row and express weeks relative to the baseline week.
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base

In [12]:
# One-hot encode the categorical columns. Each distinct value becomes its own
# 0/1 column and is registered as a model feature in FE.
COLS = ['Sex', 'SmokingStatus']
FE = []
for col in COLS:
    for mod in data[col].unique():
        data[mod] = (data[col] == mod).astype(int)
        FE.append(mod)

In [13]:
# Min-max scale the numeric inputs to [0, 1] over the whole combined frame.
# Pairs are (new scaled column, source column); the order fixes column order.
for scaled_col, raw_col in (('age', 'Age'),
                            ('BASE', 'min_FVC'),
                            ('week', 'base_week'),
                            ('percent', 'Percent')):
    raw_values = data[raw_col]
    data[scaled_col] = (raw_values - raw_values.min()) / (raw_values.max() - raw_values.min())
FE += ['age','percent','week','BASE']
data.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,Patient_Week,...,base_week,Male,Female,Ex-smoker,Never smoked,Currently smokes,age,BASE,week,percent
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,train,,,...,0.0,1,0,1,0,0,0.769231,0.241456,0.179012,0.236393
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,train,,,...,9.0,1,0,1,0,0,0.769231,0.241456,0.234568,0.215941
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,train,,,...,11.0,1,0,1,0,0,0.769231,0.241456,0.246914,0.18496
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,train,,,...,13.0,1,0,1,0,0,0.769231,0.241456,0.259259,0.201767
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,train,,,...,15.0,1,0,1,0,0,0.769231,0.241456,0.271605,0.18658


In [14]:
# Keep only the submission-grid rows; reset_index() retains the old index as an 'index' column.
test_df = data.loc[data.WHERE=='test'].reset_index()
del data

test_df.shape

(730, 23)

In [15]:
# Preview the rows that will be predicted.
test_df.head()

Unnamed: 0,index,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,...,base_week,Male,Female,Ex-smoker,Never smoked,Currently smokes,age,BASE,week,percent
0,1542,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,-18.0,1,0,1,0,0,0.615385,0.3724,0.067901,0.332421
1,1543,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,-17.0,1,0,1,0,0,0.615385,0.3724,0.074074,0.332421
2,1544,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,-16.0,1,0,1,0,0,0.615385,0.3724,0.080247,0.332421
3,1545,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,-15.0,1,0,1,0,0,0.615385,0.3724,0.08642,0.332421
4,1546,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,-14.0,1,0,1,0,0,0.615385,0.3724,0.092593,0.332421


# CT scan pre-processing

Credits to https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial and https://www.kaggle.com/arnavkj95/candidate-generation-and-luna16-preprocessing

In [16]:
def load_scan(path):
    """Read every DICOM file in ``path`` and return the slices in acquisition order.

    Slices are sorted by ``InstanceNumber``. A slice spacing is then derived
    and written to ``SliceThickness`` on every slice, trying in order:

    1. the z-distance between the first two ``ImagePositionPatient`` values,
    2. the distance between the first two ``SliceLocation`` values,
    3. the ``SliceThickness`` already stored on the first slice.

    The original code guarded step 1 with ``except NameError``, which a
    missing DICOM attribute never raises, so step 2 was unreachable. The
    fallbacks are now chained on the exceptions these lookups can raise.

    Args:
        path: directory containing the DICOM files of one scan.

    Returns:
        List of pydicom datasets sorted by ``InstanceNumber``.
    """
    # Exceptions a missing/None attribute or a single-slice scan can produce.
    lookup_errors = (AttributeError, IndexError, TypeError, ValueError)

    # dcmread is the current name of the reader; read_file is a deprecated alias.
    slices = [pydicom.dcmread(os.path.join(path, s)) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.InstanceNumber))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except lookup_errors:
        try:
            slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
        except lookup_errors:
            slice_thickness = slices[0].SliceThickness

    # Identical positions give a zero spacing; use the stored thickness instead.
    if slice_thickness==0:
        slice_thickness=slices[0].SliceThickness
    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

In [17]:
def get_pixels_hu(slices):
    """Stack the slices' pixel data and apply each slice's rescale slope/intercept.

    Args:
        slices: sequence of objects exposing ``pixel_array``, ``RescaleSlope``
            and ``RescaleIntercept``.

    Returns:
        ``int16`` array with one entry along the first axis per slice.
    """
    volume = np.stack([s.pixel_array for s in slices]).astype(np.int16)
    # Pixels flagged with -2000 (outside the scan) are reset to 0 before rescaling.
    volume[volume == -2000] = 0

    for idx, dataset in enumerate(slices):
        intercept = dataset.RescaleIntercept
        slope = dataset.RescaleSlope
        if slope != 1:
            # Scale in float64, then truncate back to int16.
            volume[idx] = (slope * volume[idx].astype(np.float64)).astype(np.int16)
        volume[idx] += np.int16(intercept)

    return np.array(volume, dtype=np.int16)

def window_image(image, window_center, window_width):
    """Return a copy of ``image`` clamped to the given intensity window.

    The window spans ``window_center - window_width // 2`` to
    ``window_center + window_width // 2``. The input array is not modified.
    """
    half_width = window_width // 2
    lower = window_center - half_width
    upper = window_center + half_width

    clamped = image.copy()
    clamped[clamped < lower] = lower
    clamped[clamped > upper] = upper
    return clamped

In [18]:
def resample(image, scan, new_spacing=[1,1,1]):
    """Resample a CT volume to (approximately) ``new_spacing``.

    The current spacing is read from the first slice of ``scan``
    (``SliceThickness`` followed by ``PixelSpacing``). The target shape is
    rounded to whole voxels, so the spacing actually achieved can differ
    slightly from the request and is returned with the volume.

    Args:
        image: 3-D array whose axes match (slice, row, column).
        scan: sequence of slices; only ``scan[0]`` is inspected.
        new_spacing: requested spacing per axis. The default list is never mutated.

    Returns:
        Tuple ``(resampled_image, achieved_spacing)``.
    """
    # Determine current pixel spacing
    spacing = np.array([scan[0].SliceThickness] + list(scan[0].PixelSpacing), dtype=np.float32)

    resize_factor = spacing / new_spacing
    new_real_shape = image.shape * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize_factor = new_shape / image.shape
    new_spacing = spacing / real_resize_factor

    # scipy.ndimage.interpolation is a deprecated namespace; zoom lives in scipy.ndimage.
    image = scipy.ndimage.zoom(image, real_resize_factor, mode='nearest')

    return image, new_spacing

In [19]:
def get_segmented_lungs(im, threshold):
    """Segment the lungs in a single 2-D slice.

    Args:
        im: 2-D image array.
        threshold: pixels strictly below this value are lung candidates.

    Returns:
        Tuple ``(masked_image, mask)`` where ``masked_image`` is ``im``
        multiplied by the binary lung mask and ``mask`` is that mask as an
        integer array.
    """
    # Step 1: convert into a binary image.
    binary = np.array(im < threshold, dtype=np.int8)

    # Step 2: remove the blobs connected to the border of the image.
    cleared = clear_border(binary)

    # Step 3: label the connected components.
    label_image = label(cleared)

    # Step 4: keep the labels with the two largest areas. Region properties
    # are computed once and each discarded region is cleared with a single
    # vectorised assignment instead of one Python iteration per pixel.
    regions = regionprops(label_image)
    areas = sorted(r.area for r in regions)
    if len(areas) > 2:
        for region in regions:
            if region.area < areas[-2]:
                coords = region.coords
                label_image[coords[:, 0], coords[:, 1]] = 0
    binary = label_image > 0

    # Step 5: erosion with a disk of radius 2, to separate lung nodules
    # attached to blood vessels.
    selem = disk(2)
    binary = binary_erosion(binary, selem)

    # Step 6: closing with a disk of radius 10, to keep nodules attached to
    # the lung wall.
    selem = disk(10)
    binary = binary_closing(binary, selem)

    # Step 7: fill in the small holes inside the binary mask of the lungs.
    edges = roberts(binary)
    binary = ndi.binary_fill_holes(edges)

    # Step 8: superimpose the binary mask on the input image.
    im = binary* im

    return im, binary.astype(int)

In [20]:
#MIN_BOUND = -1000.0
#MAX_BOUND = 320.0
    
def normalize(image, MIN_BOUND, MAX_BOUND):
    """Linearly map ``[MIN_BOUND, MAX_BOUND]`` onto ``[0, 1]`` and clamp values outside it."""
    scaled = (image - MIN_BOUND) / (MAX_BOUND - MIN_BOUND)
    return np.clip(scaled, 0., 1.)

def lung_volume(masks, spacing):
    """Return the summed mask value times the voxel volume, rounded to 3 decimals.

    ``spacing`` is indexed as (slice thickness, pixel spacing 0, pixel spacing 1).
    """
    thickness, spacing_a, spacing_b = spacing[0], spacing[1], spacing[2]
    voxel_total = np.sum(masks)
    return np.round(voxel_total * thickness * spacing_a * spacing_b, 3)

def lung_process(image, spacing, threshold):
    """Segment every slice of ``image`` with ``get_segmented_lungs``.

    ``spacing`` is accepted for interface compatibility but is not used.

    Returns:
        Tuple ``(segmented_slices, integer_masks)`` as arrays stacked along the first axis.
    """
    per_slice = [get_segmented_lungs(slice_2d, threshold) for slice_2d in image]
    segmented = [segment for segment, _ in per_slice]
    masks = [slice_mask.astype(int) for _, slice_mask in per_slice]
    return np.asarray(segmented), np.asarray(masks)

def compute_stats(img):
    """Summary statistics of the values in ``img`` that are below 0.6.

    Returns:
        Tuple ``(kurtosis, skew, std, mean, median)`` over the selected values.
    """
    flat = img.ravel()
    selected = flat[flat < 0.6]
    return (
        kurtosis(selected),
        skew(selected),
        selected.std(),
        selected.mean(),
        np.median(selected),
    )

In [21]:
def chunks(l, n):
    # Credit: Ned Batchelder
    # Link: http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """Yield successive n-sized chunks from l."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

def mean(l):
    """Arithmetic mean of the items in ``l`` (their sum divided by ``len(l)``)."""
    total = sum(l)
    count = len(l)
    return total / count

def reduce_slices(slices):
    """Average consecutive slices so the stack has roughly ``HM_SLICES`` entries.

    Slices are grouped into runs of ``ceil(len(slices) / HM_SLICES)`` and each
    run is averaged element-wise. A result that is one or two entries short is
    padded by repeating the last entry; a result that is one or two entries
    long has its trailing entries merged. Any other length is returned
    unchanged.
    """
    group_size = math.ceil(len(slices) / HM_SLICES)
    reduced = [list(map(mean, zip(*group))) for group in chunks(slices, group_size)]

    # One or two short: repeat the final entry.
    shortfall = HM_SLICES - len(reduced)
    if shortfall in (1, 2):
        for _ in range(shortfall):
            reduced.append(reduced[-1])

    # One or two long: fold the entry at index HM_SLICES into its predecessor.
    excess = len(reduced) - HM_SLICES
    if excess in (1, 2):
        for _ in range(excess):
            merged = list(map(mean, zip(reduced[HM_SLICES - 1], reduced[HM_SLICES])))
            del reduced[HM_SLICES]
            reduced[HM_SLICES - 1] = merged

    return reduced

In [22]:
def preprocess_file(patient_id):
    """Load one patient's scan and return ``(normalized float16 volume, lung mask)``.

    The window, segmentation threshold and normalisation bounds are chosen
    from the mean value of the converted pixel data, in three ranges:
    ``[-1800, -1500)``, below ``-1800``, and everything else.
    """
    patient = load_scan(test_dir + patient_id)
    patient_pixels = get_pixels_hu(patient)

    mean_value = patient_pixels.mean()
    if -1800 <= mean_value < -1500:
        window, threshold, bounds = (-1500, 3000), -1400, (-3000, 1500)
    elif mean_value < -1800:
        window, threshold, bounds = (-3000, 4500), -2200, (-4000, 300)
    else:
        window, threshold, bounds = (-300, 1200), -200, (-1500, 900)

    lung_image = window_image(patient_pixels, *window)
    pix_resampled, spacing = resample(lung_image, patient, [1,1,1])
    segmented, mask = lung_process(pix_resampled, spacing, threshold)
    normalized = normalize(segmented, *bounds)

    return normalized.astype(np.float16), mask

In [23]:
# Output directories read by save_arrays(); aliases of dicom_arrays_dir and mask_dir.
save_img = dicom_arrays_dir
save_mask = mask_dir
def save_arrays(patient_ids):
    """Preprocess one patient and write the volume and mask as ``.npy`` files.

    Despite the plural name, ``patient_ids`` is a single patient id. Files are
    written to ``save_img/<id>.npy`` and ``save_mask/<id>_mask.npy``.
    """
    volume, lung_mask = preprocess_file(patient_ids)

    np.save(f'{save_img}/{patient_ids}.npy', volume)
    np.save(f'{save_mask}/{patient_ids}_mask.npy', lung_mask)
    gc.collect()

def cache_dataset():
    """Preprocess every distinct patient in ``test_df`` using a pool of 4 worker processes."""
    unique_patients = test_df.drop_duplicates(subset=['Patient']).Patient

    with Pool(processes=4) as workers:
        progress = tqdm(workers.imap(save_arrays, unique_patients), total = len(unique_patients))
        # Drain the lazy iterator so every patient is actually processed.
        for _ in progress:
            pass

In [24]:
# One row per patient (the first row of each patient in test_df).
patient_df = test_df.copy().drop_duplicates(subset=['Patient'])
print(len(patient_df))

5


In [25]:
# Per-patient image statistics, cached as .pt files in the working directory.
# The original condition called the Path object itself (`skews_array_file()`),
# which raises TypeError as soon as the first two files exist, and checked only
# three of the six files that are loaded. All six are now checked with .exists().
# NOTE(review): loading from cache skips cache_dataset(), so the .npy volumes
# under dicom_arrays_dir must already exist for the later cells — confirm.
stat_cache_files = (
    volume_array_file,
    kurts_array_file,
    skews_array_file,
    means_array_file,
    stds_array_file,
    medians_array_file,
)

if all(cache_file.exists() for cache_file in stat_cache_files):
    print('loading pre-calculated arrays')
    volumes = torch.load(volume_array_file)
    kurts = torch.load(kurts_array_file)
    skews = torch.load(skews_array_file)
    means = torch.load(means_array_file)
    stds = torch.load(stds_array_file)
    medians = torch.load(medians_array_file)
else:
    print('Processing dicom images and caching dataset...')
    volumes = []
    kurts = []
    skews = []
    means = []
    stds = []
    medians = []

    cache_dataset()
    print('Calculating image statistics...')

    for patient_id in tqdm(patient_df.Patient, total=len(patient_df.Patient)):
        cached_img_path = f'{dicom_arrays_dir}/{patient_id}.npy'
        cached_mask_file = mask_dir/f'{patient_id}_mask.npy'

        img_array = np.load(cached_img_path)
        mask = np.load(cached_mask_file)

        # Volumes were resampled towards 1x1x1 spacing, so unit spacing is passed here.
        vol = lung_volume(np.asarray(mask), (1,1,1))
        kurt, ske, std_i, mean_i, median_i = compute_stats(img_array)

        volumes.append(vol)

        means.append(mean_i)
        stds.append(std_i)
        medians.append(median_i)
        kurts.append(kurt)
        skews.append(ske)

        gc.collect()

    # Write through the same Path constants that the cache check reads.
    torch.save(volumes, volume_array_file)
    torch.save(kurts, kurts_array_file)
    torch.save(skews, skews_array_file)
    torch.save(means, means_array_file)
    torch.save(stds, stds_array_file)
    torch.save(medians, medians_array_file)

Processing dicom images and caching dataset...


100%|██████████| 5/5 [05:25<00:00, 65.10s/it] 
  0%|          | 0/5 [00:00<?, ?it/s]

Calculating image statistics...


100%|██████████| 5/5 [00:32<00:00,  6.51s/it]


In [26]:
# Attach the per-patient image statistics. `stds` and `medians` are computed
# above but are not added as columns or model features.
patient_df["volume"] = np.asarray(volumes)/1e6
patient_df["kurts"] = kurts
patient_df["skews"] = skews
patient_df["mean_vals"] = means

# Replace missing statistics with the column mean. Assigning the result back
# is used instead of `patient_df[col].fillna(..., inplace=True)`, which acts on
# a column selection and does not reliably update the frame under pandas
# copy-on-write.
image_stat_cols = ['kurts', 'skews', 'mean_vals']
for stat_col in image_stat_cols:
    patient_df[stat_col] = patient_df[stat_col].fillna(patient_df[stat_col].mean())

# Register the new features once, so re-running this cell does not append
# duplicates to FE (which would change the model's input width).
FE += [stat_col for stat_col in image_stat_cols if stat_col not in FE]

patient_df.head()

Unnamed: 0,index,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,...,Never smoked,Currently smokes,age,BASE,week,percent,volume,kurts,skews,mean_vals
0,1542,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.067901,0.332421,26.242649,8.228007,3.074219,0.278076
146,1688,ID00421637202311550012437,-12,2739,82.045291,68,Male,Ex-smoker,test,100.0,...,0,0,0.487179,0.320208,0.012346,0.427848,8.185682,6.047752,2.390625,0.282959
292,1834,ID00422637202311677017371,-12,1930,76.672493,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.169948,0.067901,0.384612,2.608509,2.883768,1.796875,0.308838
438,1980,ID00423637202312137826377,-12,3294,79.258903,72,Male,Ex-smoker,test,100.0,...,0,0,0.589744,0.423291,0.0,0.405425,3.542903,2.10927,1.545898,0.312744
584,2126,ID00426637202313170790466,-12,2925,71.824968,73,Male,Never smoked,test,100.0,...,1,0,0.615385,0.354755,0.104938,0.345604,4.012244,1.562883,1.491211,0.316406


In [27]:
# Broadcast the per-patient image statistics onto every (patient, week) row.
scan_feature_cols = ['Patient','kurts','skews','mean_vals','volume']
test_df = test_df.merge(patient_df[scan_feature_cols], how='left', on='Patient')

test_df.head()

Unnamed: 0,index,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,...,Never smoked,Currently smokes,age,BASE,week,percent,kurts,skews,mean_vals,volume
0,1542,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.067901,0.332421,8.228007,3.074219,0.278076,26.242649
1,1543,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.074074,0.332421,8.228007,3.074219,0.278076,26.242649
2,1544,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.080247,0.332421,8.228007,3.074219,0.278076,26.242649
3,1545,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.08642,0.332421,8.228007,3.074219,0.278076,26.242649
4,1546,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.092593,0.332421,8.228007,3.074219,0.278076,26.242649


In [29]:
class AutoEncoder(nn.Module):
    """3-D convolutional autoencoder with max-pool / max-unpool symmetry.

    The encoder alternates four convolutions with four index-returning max
    pools, flattens to ``10 * 10`` values and projects to ``latent_features``.
    The decoder mirrors it, reusing the pooling indices and pre-pool shapes
    recorded by the encoder.

    The flatten is hard-coded to ``10 * 10``, so the pooled encoder output
    must hold a multiple of 100 values (exactly 100 per sample for the
    decoder's reshape to line up).
    """

    def __init__(self, latent_features=10):
        super().__init__()
        # Encoder
        self.conv1 = nn.Conv3d(1, 16, 3)
        self.conv2 = nn.Conv3d(16, 32, 3)
        self.conv3 = nn.Conv3d(32, 96, 2)
        self.conv4 = nn.Conv3d(96, 1, 1)
        self.pool1 = nn.MaxPool3d(kernel_size=2, stride=2, return_indices=True)
        self.pool2 = nn.MaxPool3d(kernel_size=3, stride=3, return_indices=True)
        self.pool3 = nn.MaxPool3d(kernel_size=2, stride=2, return_indices=True)
        self.pool4 = nn.MaxPool3d(kernel_size=2, stride=2, return_indices=True)
        self.fc1 = nn.Linear(10 * 10, latent_features)
        # Decoder
        self.fc2 = nn.Linear(latent_features, 10 * 10)
        self.deconv0 = nn.ConvTranspose3d(1, 96, 1)
        self.deconv1 = nn.ConvTranspose3d(96, 32, 2)
        self.deconv2 = nn.ConvTranspose3d(32, 16, 3)
        self.deconv3 = nn.ConvTranspose3d(16, 1, 3)
        self.unpool0 = nn.MaxUnpool3d(kernel_size=2, stride=2)
        self.unpool1 = nn.MaxUnpool3d(kernel_size=2, stride=2)
        self.unpool2 = nn.MaxUnpool3d(kernel_size=3, stride=3)
        self.unpool3 = nn.MaxUnpool3d(kernel_size=2, stride=2)

    def encode(self, x, return_partials=True):
        """Encode ``x`` to the latent vector.

        With ``return_partials=True`` the return value is the 9-tuple
        ``(latent, shape1, idx1, shape2, idx2, shape3, idx3, shape4, idx4)``
        where ``shapeK`` is the tensor shape right after the K-th convolution
        and ``idxK`` are the indices from the K-th pool. Otherwise only the
        latent tensor is returned.
        """
        pre_pool_shapes = []
        pool_indices = []
        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
            (self.conv4, self.pool4),
        )
        for conv, pool in stages:
            x = conv(x)
            pre_pool_shapes.append(x.shape)
            x, indices = pool(x)
            pool_indices.append(indices)

        x = F.relu(self.fc1(x.view(-1, 10 * 10)))

        if not return_partials:
            return x

        up3out_shape, up2out_shape, up1out_shape, up0out_shape = pre_pool_shapes
        i1, i2, i3, i4 = pool_indices
        return (x, up3out_shape, i1, up2out_shape, i2, up1out_shape, i3,
                up0out_shape, i4)

    def forward(self, x):
        """Encode then decode ``x``, returning the reconstruction."""
        (x, up3out_shape, i1, up2out_shape, i2,
         up1out_shape, i3, up0out_shape, i4) = self.encode(x)

        # Decoder: undo each pool with its recorded indices, then deconvolve.
        x = F.relu(self.fc2(x)).view(-1, 1, 1, 10, 10)
        decoder_stages = (
            (self.unpool0, up0out_shape, i4, self.deconv0),
            (self.unpool1, up1out_shape, i3, self.deconv1),
            (self.unpool2, up2out_shape, i2, self.deconv2),
            (self.unpool3, up3out_shape, i1, self.deconv3),
        )
        for unpool, target_shape, indices, deconv in decoder_stages:
            x = unpool(x, output_size=target_shape, indices=indices)
            x = deconv(x)

        return x

In [30]:
class RNNFeatures(nn.Module):
    """RNN head that combines tabular inputs with CT-scan latent features.

    The latent features are projected to ``input_dim`` with a linear layer,
    concatenated with the tabular inputs (so the RNN sees ``input_dim * 2``
    values per step) and passed through a ReLU RNN and two linear layers that
    produce ``output_dim`` values.

    Args:
        input_dim: number of tabular features per row.
        hidden_dim: RNN hidden size.
        layer_dim: number of stacked RNN layers.
        output_dim: number of outputs.
        in_ctscan_features: size of the CT-scan latent vector.
    """
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, in_ctscan_features=10):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.in_ctscan_features = in_ctscan_features
        self.match_sz = nn.Linear(in_ctscan_features, input_dim)

        self.rnn = nn.RNN(input_dim*2, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu',dropout=0.1)
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, x1, x2):
        """Return predictions for tabular rows ``x1`` and latent rows ``x2``.

        Both 2-D inputs are viewed as ``(-1, rows, features)``, so the rows of
        a call form the sequence axis of the batch-first RNN; only the last
        step's output is used.
        """
        x1 = x1.view(-1, len(x1), len(x1[0]))
        x2 = F.relu(self.match_sz(x2))
        x2 = x2.view(-1, len(x2), len(x2[0]))

        x = torch.cat([x1, x2], dim=2)

        h0 = self.init_hidden(x)
        out, hn = self.rnn(x, h0)
        out = F.relu(self.fc(out[:, -1, :]))
        out = self.fc_out(out)
        return out

    def init_hidden(self, x):
        """Zero initial hidden state of shape ``(layer_dim, batch, hidden_dim)``.

        It is allocated on the device and with the dtype of ``x``. The
        original always built it on the CPU in the default dtype, which fails
        as soon as the model and its inputs live on a GPU.
        """
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim,
                         device=x.device, dtype=x.dtype)
        return h0

In [31]:
def _load_autoencoder(path):
    """Build an AutoEncoder from the checkpoint at ``path``, in eval mode on ``device``."""
    weights = torch.load(path,map_location=torch.device('cpu'))
    net = AutoEncoder()
    net.load_state_dict(weights)
    return net.to(device).float().eval()


# One encoder per checkpoint; their latent vectors are averaged downstream.
autoencoder_models = [_load_autoencoder(path) for path in MODELS1]

In [32]:
# Load the RNN fold models. `map_location=device` remaps checkpoint tensors
# onto the active device, so a checkpoint saved on a GPU still loads when the
# kernel only has a CPU (the original call had no map_location).
models = []
for path in MODELS:
    model = RNNFeatures(12, 150, 2, 3)
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.float()
    model.eval()
    models.append(model)

In [33]:
# Helper that generates (and caches) the autoencoder latent features of a scan
class GenerateLatentFeatures:
    """Callable mapping ``(img_id, img_array)`` to an averaged latent vector.

    Results are cached as ``<latent_dir>/<img_id>_lat.npy``; a cached file is
    returned without touching the models.

    Args:
        autoencoder_models: encoders whose latent outputs are averaged. They
            are now stored on the instance; the original ignored this
            argument and read the module-level ``autoencoder_models`` instead.
        latent_dir: directory holding the cached latent vectors.
    """
    def __init__(self, autoencoder_models, latent_dir):
        self.autoencoder_models = autoencoder_models
        self.latent_dir = Path(latent_dir)

    def __call__(self, img_id, img_array):
        cached_latent_file = self.latent_dir/f'{img_id}_lat.npy'

        if cached_latent_file.is_file():
            return np.load(cached_latent_file)

        if len(self.autoencoder_models) == 0:
            raise ValueError('GenerateLatentFeatures needs at least one autoencoder model')

        # Bring the stack to HM_SLICES entries: average down when too long,
        # then zero-pad at the end when (still) too short.
        if len(img_array) > HM_SLICES:
            img_array = np.asarray(reduce_slices(img_array))
        if len(img_array) < HM_SLICES:
            img_array = np.pad(img_array,[[0,HM_SLICES-len(img_array)],[0,0],[0,0]],constant_values=0.0)

        img = torch.tensor(img_array).unsqueeze(0).float()
        # The slice axis acts as the channel axis here, so only the last two
        # axes are resized to 256.
        img = F.interpolate(img, size=256)
        img = img.view(img.shape[0], 1, img.shape[1], img.shape[2], img.shape[3])
        # Move the existing tensor; torch.tensor(tensor) made a needless copy.
        img = img.to(device)

        preds = 0.0
        with torch.no_grad():
            for model in self.autoencoder_models:
                pred = model.encode(img, return_partials=False).squeeze(0)
                preds+=pred.detach().cpu().numpy()
            preds = preds/len(self.autoencoder_models)

        latent_features = np.asarray(preds)
        np.save(cached_latent_file, latent_features)

        return latent_features

In [34]:
class fibrosisDataset(Dataset):
    """Row-wise dataset over a patient/week frame.

    Rows are sorted by ``Patient`` then ``Weeks`` and re-indexed at
    construction, so item ``i`` is row ``i`` of that sorted frame.

    Args:
        df: frame with at least ``Patient``, ``Weeks``, ``FVC`` and the ``FE`` columns.
        rand: stored but not used by this class.
        mode: ``'train'`` / ``'valid'`` items include the label; any other mode omits it.
        extract_features: callable ``(img_id, img_array) -> latent features``.
            It is required for ``__getitem__``.
    """
    def __init__(self,
                 df,
                 rand=False,
                 mode='train',
                 extract_features=None,
                ):

        self.df = df.sort_values(by=['Patient','Weeks'],ascending=True).reset_index(drop=True)
        self.rand = rand
        self.mode = mode
        self.extract_features = extract_features

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(tabular, label, features)`` in train/valid mode, else ``(tabular, features)``.

        Raises:
            RuntimeError: if no ``extract_features`` callable was supplied.
                The original fell through to an UnboundLocalError on ``features``.
        """
        if not self.extract_features:
            raise RuntimeError(
                'fibrosisDataset requires an extract_features callable to build items'
            )

        row = self.df.iloc[index]
        img_id = row.Patient

        file_path = f'{dicom_arrays_dir}/{img_id}.npy'
        img_array = np.load(file_path)

        tabular_data = row[FE]
        features = self.extract_features(img_id, img_array)

        if self.mode=='train' or self.mode=='valid':
            label = row.FVC
            return torch.tensor(tabular_data), torch.tensor(label), torch.tensor(features)
        else:
            return torch.tensor(tabular_data), torch.tensor(features)

In [35]:
# Preview the inference frame, now including the image-derived columns.
test_df.head()

Unnamed: 0,index,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,WHERE,Confidence,...,Never smoked,Currently smokes,age,BASE,week,percent,kurts,skews,mean_vals,volume
0,1542,ID00419637202311204720264,-12,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.067901,0.332421,8.228007,3.074219,0.278076,26.242649
1,1543,ID00419637202311204720264,-11,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.074074,0.332421,8.228007,3.074219,0.278076,26.242649
2,1544,ID00419637202311204720264,-10,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.080247,0.332421,8.228007,3.074219,0.278076,26.242649
3,1545,ID00419637202311204720264,-9,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.08642,0.332421,8.228007,3.074219,0.278076,26.242649
4,1546,ID00419637202311204720264,-8,3020,70.186855,73,Male,Ex-smoker,test,100.0,...,0,0,0.615385,0.3724,0.092593,0.332421,8.228007,3.074219,0.278076,26.242649


In [38]:
def test():
    """Run the RNN ensemble over ``test_df`` and return the averaged quantile predictions.

    Returns:
        DataFrame with one row per dataset item, in the dataset's
        (Patient, Weeks)-sorted order, and one column per entry of ``quantiles``.
    """
    test_dataset = fibrosisDataset(test_df, mode='test', extract_features=GenerateLatentFeatures(autoencoder_models, latent_dir))

    dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                                          num_workers=num_workers, pin_memory=False)

    PREDS = []
    for batch in tqdm(dataloader, total=len(dataloader)):
        # Inputs must be on the same device as the models.
        inputs = batch[0].float().to(device)
        features = batch[1].float().to(device)
        with torch.no_grad():
            preds = 0
            for model in models:
                preds += model(inputs, features)
            preds /= len(models)
        # Move back to the CPU so the results can be converted to NumPy.
        PREDS.append(preds.cpu())

    avg_preds = torch.cat(PREDS, dim=0).numpy()

    df = pd.DataFrame(data=avg_preds, columns=list(quantiles))
    return df

In [40]:
# Quantile predictions, one row per (patient, week) in the dataset's sorted order.
df = test()

100%|██████████| 730/730 [00:59<00:00, 12.26it/s]


In [41]:
# Predictions in `df` follow the dataset's order: test_df sorted by
# (Patient, Weeks) with a fresh index. The keys are taken from that same
# ordering instead of relying on sub_csv happening to be sorted the same way,
# and a new frame is built rather than assigning into a slice of sub_csv.
ordered_keys = (
    test_df.sort_values(by=['Patient', 'Weeks'], ascending=True)
           .reset_index(drop=True)['Patient_Week']
)
assert len(ordered_keys) == len(df), 'prediction count does not match the number of test rows'

sub_file = pd.DataFrame({
    'Patient_Week': ordered_keys,
    # Median quantile is the point estimate.
    'FVC': df[quantiles[1]],
    # Spread between the outer quantiles is the reported confidence.
    'Confidence': df[quantiles[2]] - df[quantiles[0]],
})

sub_file.to_csv('submission.csv', index=False)

In [42]:
# Read the written file back as a sanity check of its columns and first rows.
sub = pd.read_csv('submission.csv')
sub.head()

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,2900.3438,264.5603
1,ID00419637202311204720264_-11,2898.0027,264.36816
2,ID00419637202311204720264_-10,2895.6626,264.17554
3,ID00419637202311204720264_-9,2893.3218,263.98315
4,ID00419637202311204720264_-8,2890.9814,263.79102
