In [1]:
# !pip install ../input/rsna-pip-wheels-v3/segmentation_models_pytorch-0.3.0-py3-none-any.whl --no-index --find-links=../input/rsna-pip-wheels-v3 -q

In [None]:
### Please specify your input / output paths here.
import os  # imported locally: this cell runs before the main import cell

PROJECT_FOLDER = "YOUR_PROJECT_FOLDER" # parent folder of the input images
# os.path.join only inserts a separator when PROJECT_FOLDER does not already
# end with one, so both "data" and "data/" resolve to "data/images/".
IMAGE_DATA_FOLDER = os.path.join(PROJECT_FOLDER, "images/") # folder of the input images
INPUT_TEST_CSV_FILE = "YOUR_TEST_FILE" # CSV file listing the locations / paths of the test cases (DICOM)
OUTPUT_FILE = "YOUR_OUTPUT" # written in CSV format

In [2]:
# !pip install ../input/rsna-pip-wheels-v3/timm-0.6.11-py3-none-any.whl -q

In [3]:
# !pip install ../input/rsna-pip-wheels-v3/python_gdcm-3.0.19-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl -q

In [4]:
# !pip install ../input/rsna-pip-wheels-v3/pylibjpeg-1.4.0-py3-none-any.whl

In [5]:
import re
%load_ext autoreload
%autoreload 2

In [None]:
import timm
timm.__version__

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import importlib

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import glob
from scipy.special import expit

import cv2

cv2.setNumThreads(0)

from os.path import join

import os
# Expose only device 0, with devices enumerated in PCI bus order.
# NOTE(review): these variables are read when CUDA is first initialised, so
# this cell must run before any GPU work — confirm nothing above touches CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# Scratch folder; exist_ok=True removes the check-then-create race of
# os.path.exists() followed by os.makedirs().
os.makedirs('tmp', exist_ok=True)

In [8]:
# Make the project folders importable by bare module name; the config,
# dataset, model and post-processing modules are later resolved through
# importlib.import_module.
sys.path.extend([
    './rsna2022-codebase/configs',
    './rsna2022-codebase/data',
    './rsna2022-codebase/models',
    './rsna2022-codebase/postprocess',
    './rsna2022-stage1-s1b',
])

In [9]:
# Load the table of test cases; the print shows (n_rows, n_columns).
test_df = pd.read_csv(INPUT_TEST_CSV_FILE)

print('test shape:', test_df.shape)

test shape: (5161, 3)


In [None]:
test_df.head()

In [11]:
if len(test_df)==3:
    # Fix mismatch with test_images folder: replace the 3-row table with one
    # row per (study, prediction target) for the three hard-coded studies.
    # All rows are built in a single DataFrame call; the former row-by-row
    # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
    test_df = pd.DataFrame(
        [
            {'row_id': i+'_'+j, 'StudyInstanceUID': i, 'prediction_type': j}
            for i in ['1.2.826.0.1.3680043.22327','1.2.826.0.1.3680043.25399','1.2.826.0.1.3680043.5876']
            for j in ['C1','C2','C3','C4','C5','C6','C7','patient_overall']
        ],
        columns = ['row_id','StudyInstanceUID','prediction_type'],
    )
    # NOTE(review): the rebuilt frame has no 'image_folder' column, which the
    # image-listing cell further down reads from test_df — confirm this branch
    # is still expected to work end to end.

    IS_PUBLIC = True
else:
    IS_PUBLIC = False

In [12]:
# Per-target reference rates consumed by the rescaling step of
# post_process_pipeline, indexed in the order C1..C7, patient_overall
# (the column order of the commented-out computation below).
# NOTE(review): the literals look like the training-label means that the
# commented-out code would produce — confirm against the training data.
# means = train_df[[
#     "C1",
#     "C2",
#     "C3",
#     "C4",
#     "C5",
#     "C6",
#     "C7",
#     "patient_overall"
# ]].mean(numeric_only=True)
means = [0.072313, 0.141159, 0.036157, 0.053492, 0.080238, 0.137197, 0.194651, 0.475978]

In [None]:
test_df.head(10)

In [14]:
# Build the frame-level table: one row per DICOM file of every test study.
images = []
study_ids = []
frame_ids = []

for index, row in test_df.iterrows():
    folder = row['image_folder']
    uid = row['StudyInstanceUID']

    files = glob.glob(f"{folder}/*.d*")
    images += files
    # The frame number is the run of digits right before the ".d…" extension.
    # Match on the file name only and escape the dot: on the full path the
    # old pattern `(\d+).d` could pick up digits from a directory name
    # (e.g. "2022_data/…") instead of the slice number.
    frame_ids += [int(re.findall(r'(\d+)\.d', os.path.basename(x))[0]) for x in files]
    study_ids += [uid] * len(files)

# Sort so that the frames of each study are contiguous and in slice order;
# later steps (rolling means, positional z indices) rely on this row order.
df = pd.DataFrame({
    "Image": images,
    "StudyInstanceUID": study_ids,
    "frame": frame_ids
}).sort_values(["StudyInstanceUID", "frame"]).reset_index(drop=True)

# Zero-filled label columns (no ground truth at inference time).
# NOTE(review): presumably required by the project's dataset classes — confirm.
df[['label_frac_c1', 'label_frac_c2', 'label_frac_c3', 'label_frac_c4','label_frac_c5', 'label_frac_c6', 'label_frac_c7']] = 0
df[['label_vert_c1', 'label_vert_c2', 'label_vert_c3', 'label_vert_c4','label_vert_c5', 'label_vert_c6', 'label_vert_c7']] = 0

In [15]:
df.shape

(1662687, 17)

In [None]:
df.head()

In [17]:
len(df)

1662687

In [18]:
def load_seg_model(cfg_name, model_name):
    """Load the segmentation model and the project helpers needed to run it.

    Args:
        cfg_name: name of the config module to import; it must expose ``cfg``.
        model_name: folder (relative to the working directory) containing
            ``*.pth`` checkpoints. Only one checkpoint per folder is loaded.

    Returns:
        Tuple ``(nets, cfg, CustomDataset, batch_to_device)`` where ``nets`` is
        a list of models in eval mode on ``cfg.device``, ``cfg`` is the config
        patched with the inference settings below, and the last two items come
        from the dataset module named by ``cfg.dataset``.

    Raises:
        FileNotFoundError: if the folder contains no ``*.pth`` file (previously
            an empty model list was returned and the failure surfaced later).
    """
    # Resolve the checkpoint before any heavy import so that a wrong folder
    # fails immediately. Sorting makes the pick deterministic; the former
    # iglob + break took whichever file the OS listed first.
    state_dicts = sorted(glob.glob(f'./{model_name}/*.pth'))[:1]
    if not state_dicts:
        raise FileNotFoundError(f"no .pth checkpoint found in './{model_name}'")

    cfg = importlib.import_module(cfg_name)
    importlib.reload(cfg)
    cfg = cfg.cfg
    #print(cfg)
    print(cfg.model, cfg.dataset, cfg.backbone, cfg.image_width, cfg.image_height)

    cfg.mixed_precision = False

    ds = importlib.import_module(cfg.dataset)
    importlib.reload(ds)
    CustomDataset = ds.CustomDataset
    # collate_fn = importlib.import_module(cfg.dataset).collate_fn
    batch_to_device = ds.batch_to_device

    # Replace the pipeline's module name with the callable itself.
    cfg.post_process_pipeline = importlib.import_module(cfg.post_process_pipeline).post_process_pipeline

    m = importlib.import_module(cfg.model)
    importlib.reload(m)
    Net = m.Net

    # test settings
    cfg.data_folder = IMAGE_DATA_FOLDER
    cfg.test_data_folder = IMAGE_DATA_FOLDER
    cfg.data_dir = PROJECT_FOLDER
    cfg.pretrained = False
    cfg.device="cuda" if torch.cuda.is_available() else "cpu"
    cfg.return_logits = True

    cfg.calc_loss = False

    print(state_dicts)

    nets = []
    for checkpoint_path in state_dicts:
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        d = torch.load(checkpoint_path, map_location="cpu")['model']
        # Drop every "module." fragment from the parameter names
        # (presumably left by a DataParallel/DDP wrapper at training time).
        sd = {k.replace("module.", ""): v for k, v in d.items()}

        net = Net(cfg).eval().to(cfg.device)
        net.load_state_dict(sd)

        nets.append(net)

    print("-------------")
    return nets, cfg, CustomDataset, batch_to_device

In [19]:
def load_model(cfg_name, model_name):
    """Load a classification model and the project helpers needed to run it.

    Args:
        cfg_name: name of the config module to import; it must expose ``cfg``.
        model_name: folder (relative to the working directory) containing
            ``*.pth`` checkpoints. Only one checkpoint per folder is loaded.

    Returns:
        Tuple ``(nets, cfg, CustomDataset, batch_to_device)`` where ``nets`` is
        a list of models in eval mode on ``cfg.device``, ``cfg`` is the config
        patched with the inference settings below (including
        ``cfg.box_3d = 'crop_box_3d.csv'``), and the last two items come from
        the dataset module named by ``cfg.dataset``.

    Raises:
        FileNotFoundError: if the folder contains no ``*.pth`` file (previously
            an empty model list was returned, silently shrinking the ensemble).
    """
    # Resolve the checkpoint before any heavy import so that a wrong folder
    # fails immediately. Sorting makes the pick deterministic; the former
    # iglob + break took whichever file the OS listed first.
    state_dicts = sorted(glob.glob(f'./{model_name}/*.pth'))[:1]
    if not state_dicts:
        raise FileNotFoundError(f"no .pth checkpoint found in './{model_name}'")

    cfg = importlib.import_module(cfg_name)
    importlib.reload(cfg)
    cfg = cfg.cfg
    print(cfg.model, cfg.dataset, cfg.backbone, cfg.image_width, cfg.image_height, cfg.frames_step_size)

    cfg.mixed_precision = False

    ds = importlib.import_module(cfg.dataset)
    importlib.reload(ds)
    CustomDataset = ds.CustomDataset
    batch_to_device = ds.batch_to_device

    # Replace the pipeline's module name with the callable itself.
    cfg.post_process_pipeline = importlib.import_module(cfg.post_process_pipeline).post_process_pipeline

    m = importlib.import_module(cfg.model)
    importlib.reload(m)
    Net = m.Net

    # test settings
    cfg.data_folder = IMAGE_DATA_FOLDER
    cfg.test_data_folder = IMAGE_DATA_FOLDER
    cfg.data_dir = PROJECT_FOLDER
    cfg.pretrained = False
    cfg.device="cuda" if torch.cuda.is_available() else "cpu"
    # File name of the crop-box table that the notebook writes after the
    # segmentation stage.
    cfg.box_3d = 'crop_box_3d.csv'
    cfg.return_logits = True
    cfg.calc_loss = False

    print(state_dicts)

    nets = []
    for checkpoint_path in state_dicts:
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        d = torch.load(checkpoint_path, map_location="cpu")['model']
        # Drop every "module." fragment from the parameter names
        # (presumably left by a DataParallel/DDP wrapper at training time).
        sd = {k.replace("module.", ""): v for k, v in d.items()}

        net = Net(cfg).eval().to(cfg.device)
        net.load_state_dict(sd)

        nets.append(net)

    print("-------------")
    return nets, cfg, CustomDataset, batch_to_device

In [None]:
# Stage 1: segmentation model. Despite the singular name, `net` is the LIST of
# models returned by load_seg_model (it is indexed as net[0] later).
net, cfg, CustomDataset, batch_to_device = load_seg_model('stage1_S1B','rsna2022-stage1-s1b')

In [21]:
# Zero-filled C1..C7 columns; post_process_pipeline overwrites them later.
# NOTE(review): presumably the dataset class expects them to exist — confirm.
df[['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7']] = 0 

In [22]:
# Inference-time overrides on the segmentation config returned by load_seg_model.
# NOTE(review): cache_size / cache_disk are interpreted by the project's
# dataset code, which is not visible here — confirm their meaning there.
cfg.batch_size = 64
cfg.cache_size = 200
cfg.cache_disk = ""

In [23]:
from torchvision.ops import masks_to_boxes
import pydicom

In [None]:
# Stage 1 inference: one (x1, y1, x2, y2) box per frame, all zeros when the
# predicted mask is (almost) empty. Row order follows df (shuffle=False).
boxes = []
with torch.inference_mode():
    
    test_ds = CustomDataset(df, cfg, cfg.val_aug, mode="test")
    test_dl = DataLoader(test_ds, shuffle=False, batch_size = cfg.batch_size, num_workers = 2)
    
    for batch in tqdm(test_dl):
        batch = batch_to_device(batch, cfg.device)
        out = net[0](batch)
        # Binary mask per sample: max over dim 1 of the sigmoid outputs,
        # thresholded at 0.5.
        # NOTE(review): dim 1 is presumably the class channel — confirm.
        pred = (out['logits'].sigmoid().max(1)[0] > 0.5).long()
        box = torch.zeros((pred.shape[0],4))
        
        # Only masks with more than 10 positive pixels get a box; the other
        # rows of `box` stay zero.
        not_empty = pred.sum((1,2))>10
        b = masks_to_boxes(pred[not_empty])
        box[not_empty] = b.cpu()
        boxes += [box]
# Stack the per-batch tensors into one (n_frames, 4) tensor.
boxes = torch.cat(boxes)

 73%|███████▎  | 19081/25980 [2:22:04<46:17,  2.48it/s]  

In [None]:
# Attach the per-frame boxes to their study id; the assignment is positional,
# relying on `boxes` having been produced in df row order.
box_preds = df[['StudyInstanceUID']].copy()
box_preds[['x1','y1','x2','y2']] = boxes.numpy()

In [None]:
#need original img shape to scale boxes to fit original image shape

def get_dicom_meta(path):
    """
    Return the pixel-grid shape ``(rows, columns)`` of the DICOM file at `path`.

    Only the header is read (``stop_before_pixels=True``) and the shape is
    taken from the Rows / Columns attributes, so the pixel data is never
    decoded and no JPEG decompression plugin is needed for this step.
    For single-frame, single-sample images this equals ``pixel_array.shape``.
    """
    header = pydicom.dcmread(path, stop_before_pixels=True)
    shape = (int(header.Rows), int(header.Columns))

    return shape

# One representative file per study: drop_duplicates keeps the first row of
# each study, i.e. its lowest frame because df is sorted by (study, frame).
# NOTE(review): assumes every slice of a study has the same shape — confirm.
meta = df[['Image','StudyInstanceUID']].drop_duplicates('StudyInstanceUID')
img_fns = [fn for fn in meta['Image'].values]
shapes = [get_dicom_meta(p) for p in img_fns]
# Each shape is spread over two columns, so get_dicom_meta must return
# exactly two values per file.
meta[['ImageHeight','ImageWidth']] = shapes

# to_dict() gives {'ImageHeight': {uid: h}, 'ImageWidth': {uid: w}}, which is
# how get_box looks the sizes up.
scales = meta[['StudyInstanceUID','ImageHeight','ImageWidth']].drop_duplicates().set_index('StudyInstanceUID').to_dict()

In [None]:
# Geometry used to map predicted boxes back to the original image.
# NOTE(review): presumably the segmentation input is a 320x320 centre crop of
# a 360x360 resized image, hence the 20 px offset below — confirm against the
# stage-1 config / augmentations.
image_width = 320
image_height = 320

image_width_orig = 360
image_height_orig = 360

pad_ = (image_height_orig - image_height) // 2

def get_box(study_id, box_table=None, scale_lookup=None):
    """Aggregate the per-frame boxes of one study into a single 3D crop box.

    Args:
        study_id: StudyInstanceUID to process.
        box_table: frame-level DataFrame with columns StudyInstanceUID, x1,
            y1, x2, y2. Defaults to the notebook-level ``box_preds``.
        scale_lookup: mapping ``{'ImageWidth': {uid: w}, 'ImageHeight':
            {uid: h}}``. Defaults to the notebook-level ``scales``.

    Returns:
        ``np.array([x1, x2, y1, y2, z1, z2])`` (note the x1, x2, y1, y2
        order). x/y are the 5 % / 95 % quantiles of the frame boxes with area
        > 9, shifted by ``pad_`` and scaled to the original image size; z1/z2
        are the 5 % / 95 % quantiles of the positions (within the study's
        rows) of frames whose box area is > 0. If no frame has a usable box,
        the study id is printed and an all-zero box is returned.

    Raises:
        KeyError: if ``study_id`` is missing from ``scale_lookup``.
    """
    if box_table is None:
        box_table = box_preds
    if scale_lookup is None:
        scale_lookup = scales

    in_study = box_table['StudyInstanceUID'] == study_id
    raw_boxes = box_table.loc[in_study, ['x1','y1','x2','y2']].values
    areas = (raw_boxes[:,2]-raw_boxes[:,0])*(raw_boxes[:,3]-raw_boxes[:,1])

    x_scale = scale_lookup['ImageWidth'][study_id] / image_width_orig
    y_scale = scale_lookup['ImageHeight'][study_id] / image_height_orig

    # Ignore degenerate boxes, then undo the crop offset.
    usable_boxes = raw_boxes[areas > 9] + pad_
    if len(usable_boxes) == 0:
        # Explicit empty check instead of a bare `except:` around np.quantile,
        # which also hid unrelated errors (and KeyboardInterrupt).
        print(study_id)
        return np.array([0,0,0,0,0,0])

    x1y1 = np.quantile(usable_boxes[:,:2],0.05,axis=0)
    x2y2 = np.quantile(usable_boxes[:,2:],0.95,axis=0)
    # Non-empty here: every usable box (area > 9) also has area > 0.
    frame_positions = np.where(areas > 0)[0]
    z1 = np.quantile(frame_positions,0.05)
    z2 = np.quantile(frame_positions,0.95)

    return np.array([x_scale*x1y1[0],x_scale*x2y2[0],y_scale*x1y1[1],y_scale*x2y2[1],z1,z2])

In [None]:
# One 3D crop box per study, stored next to the study's image size and written
# to the file name that load_model assigns to cfg.box_3d.
# NOTE: `boxes` is rebound here from the per-frame tensor to a list of
# per-study arrays.
boxes = [get_box(study_id) for study_id in tqdm(meta['StudyInstanceUID'].values)]
meta[['x1','x2','y1','y2','z1','z2']] = boxes
meta.to_csv('crop_box_3d.csv',index=False)

In [None]:
# Reload the table just written, replacing the in-memory `meta`.
meta = pd.read_csv('crop_box_3d.csv')

In [None]:
# Stage 2, first group: one model. `net`, `cfg`, `CustomDataset` and
# `batch_to_device` are rebound from their stage-1 values here.
nets = []
net, cfg, CustomDataset, batch_to_device = load_model("stage2_cfg_ch_11_fix", "rsna2022-stage2-cfg-ch-11")
nets += net

In [None]:
# Inference-time overrides on the config returned by the preceding load_model call.
cfg.batch_size = 64
cfg.cache_size = 200
cfg.cache_disk = ""

# Collect, per model in `nets`, the list of per-batch logits arrays
# (row order follows df because shuffle=False).
with torch.inference_mode():

    test_ds = CustomDataset(df, cfg, cfg.val_aug, mode="test")
    test_dl = DataLoader(test_ds, shuffle=False, batch_size = cfg.batch_size, num_workers = 2)

    fold_preds0 = [[] for i in range(len(nets))]
    for batch in tqdm(test_dl):
        batch = batch_to_device(batch, cfg.device)
        # NOTE: the loop variable rebinds the notebook-level name `net`.
        for i, net in enumerate(nets):
            logits = net(batch)['logits'].float().detach().cpu().numpy()

            fold_preds0[i] += [logits]

In [None]:
# Merge the per-batch arrays into one array per model.
# NOTE: rebinding the same name makes this cell non-idempotent — it must run
# exactly once after the inference cell.
fold_preds0 = [np.concatenate(p, axis=0) for p in fold_preds0]

In [None]:
len(fold_preds0[0])

In [None]:
# Stage 2, second group: four models loaded from (config module, checkpoint
# folder) pairs. After the loop, `cfg`, `CustomDataset` and `batch_to_device`
# hold the values returned for the LAST pair.
nets = []
for stage2_cfg_name, stage2_model_dir in (
    ("stage2_cfg_ps_wd_29_val", "stage2-cfg-ps-wd-29-ff"),
    ("stage2_cfg_ps_wd_30_val", "stage2-cfg-ps-wd-30-ff"),
    ("stage2_cfg_ps_wd_37_val", "stage2-cfg-ps-wd-37-ff"),
    ("stage2_cfg_ps_wd_40_val", "stage2-cfg-ps-wd-40-ff"),
):
    net, cfg, CustomDataset, batch_to_device = load_model(stage2_cfg_name, stage2_model_dir)
    nets += net

In [None]:
len(nets)

In [None]:
# Inference-time overrides on the config of the LAST model loaded above.
cfg.batch_size = 64
cfg.cache_size = 200
cfg.cache_disk = ""

# A single dataset, built from that last config, feeds every model in `nets`.
# NOTE(review): assumes all loaded configs yield identical model inputs — confirm.
with torch.inference_mode():

    test_ds = CustomDataset(df, cfg, cfg.val_aug, mode="test")
    test_dl = DataLoader(test_ds, shuffle=False, batch_size = cfg.batch_size, num_workers = 2)

    # One list of per-batch logits arrays per model.
    fold_preds = [[] for i in range(len(nets))]
    for batch in tqdm(test_dl):
        batch = batch_to_device(batch, cfg.device)
        # NOTE: the loop variable rebinds the notebook-level name `net`.
        for i, net in enumerate(nets):
            logits = net(batch)['logits'].float().detach().cpu().numpy()

            fold_preds[i] += [logits]

In [None]:
# Sanity check: one prediction list per loaded model.
assert (len(nets) == len(fold_preds))

In [None]:
# Merge the per-batch arrays into one array per model.
# NOTE: rebinding the same name makes this cell non-idempotent — it must run
# exactly once after the inference cell.
fold_preds = [np.concatenate(p, axis=0) for p in fold_preds]

In [None]:
# Sanity check: one prediction row per row of df.
assert (len(fold_preds[0]) == len(df))

In [None]:
# Ensemble list: fold_preds0 appears twice, so its model(s) count double in
# the plain mean taken in the next cell.
# NOTE(review): presumably a deliberate 2x weight — confirm.
# NOTE: the cell is non-idempotent; re-running it keeps growing fold_preds.
fold_preds = fold_preds0 + fold_preds0 + fold_preds
len(fold_preds)

In [None]:
# Average the model outputs element-wise, then apply the logistic sigmoid.
# The averaging happens before the sigmoid (load_model sets cfg.return_logits = True).
preds = expit(np.mean(fold_preds, axis=0))

In [None]:
def post_process_pipeline(cfg, val_data, val_df, window_size=15, target_means=None):
    """Turn frame-level fracture probabilities into per-study predictions.

    Steps:
      1. store ``val_data`` in the C1..C7 columns of ``val_df``;
      2. smooth every column with a rolling mean over ``window_size``
         consecutive rows of the same study (the first ``window_size - 1``
         rows of each study become NaN);
      3. take the per-study maximum of the smoothed values;
      4. derive ``patient_overall`` as 1 - prod(1 - Ck);
      5. rescale each of the 8 columns to
         ``p / mean(p) * 2 * m / (1 + m)`` and clip to [0.01, 0.99].

    Step 5 used to write into ``preds[pred_cols].values``, a temporary copy,
    so it had no effect on the returned frame; it is now applied for real.
    NOTE(review): this changes the returned values compared with earlier
    runs — re-validate any blend weights tuned on the old output.

    Args:
        cfg: unused; kept so the signature matches the project's pipelines.
        val_data: array-like of shape (len(val_df), 7) with frame-level
            probabilities for C1..C7.
        val_df: frame-level DataFrame with a ``StudyInstanceUID`` column whose
            rows are in slice order within each study. It is MODIFIED IN
            PLACE: C1..C7 end up holding the smoothed values.
        window_size: rolling-window length in rows (default 15).
        target_means: 8 reference rates ordered C1..C7, patient_overall.
            Defaults to the notebook-level ``means``.

    Returns:
        DataFrame with one row per study and the columns StudyInstanceUID,
        C1..C7, patient_overall. Studies with fewer than ``window_size`` rows
        get NaN predictions.
    """
    if target_means is None:
        target_means = means

    vert_cols = ["C1", "C2", "C3", "C4", "C5", "C6", "C7"]

    val_df[vert_cols] = val_data

    # Rolling mean within each study; dropping the group level restores the
    # original row index so the assignment aligns row by row.
    val_df[vert_cols] = (
        val_df.groupby("StudyInstanceUID")[vert_cols]
        .rolling(window_size)
        .mean()
        .reset_index(level=0, drop=True)
    )

    preds = val_df.groupby("StudyInstanceUID")[vert_cols].max().reset_index()

    # Probability that at least one vertebra is fractured, treating the seven
    # vertebra predictions as independent. Explicit products keep NaN
    # propagation identical to the original expression.
    none_fractured = 1 - preds[vert_cols[0]]
    for col in vert_cols[1:]:
        none_fractured = none_fractured * (1 - preds[col])
    preds["patient_overall"] = 1 - none_fractured

    for col, target in zip(vert_cols + ["patient_overall"], target_means):
        # Series.mean() skips NaN, so one short study cannot turn the whole
        # column into NaN.
        rescaled = (preds[col] / preds[col].mean()) * 2 * target / (1 + target)
        preds[col] = rescaled.clip(0.01, 0.99)

    return preds

In [None]:
# Per-study baseline predictions from the frame-level ensemble probabilities.
# NOTE: the call also overwrites df's C1..C7 columns in place.
preds_pp = post_process_pipeline(cfg, preds, df)

In [None]:
# Persist the baseline predictions; index=False is not passed, so the row
# index is written as the first column.
preds_pp.to_csv("7th_base_pred.csv")

In [None]:
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
import torch
import collections

from dataset import RSNAStackerDataset

# df column names written at the bottom of this cell before the stacker runs:
# pred_cols receive the frame-level ensemble probabilities, label_cols are
# zero-filled.
# NOTE: pred_cols is rebound to the C1..C7 / patient_overall names in the
# blending cell further down.
pred_cols = [
    "pred_frac_c1",
    "pred_frac_c2",
    "pred_frac_c3",
    "pred_frac_c4",
    "pred_frac_c5",
    "pred_frac_c6",
    "pred_frac_c7",
]
label_cols = [
    "label_frac_c1",
    "label_frac_c2",
    "label_frac_c3",
    "label_frac_c4",
    "label_frac_c5",
    "label_frac_c6",
    "label_frac_c7",
]

class RSNAStackerModel(nn.Module):
    """Fully connected stacker network.

    ``features`` is Linear -> PReLU repeated three times (widths 256, 128,
    64), followed by BatchNorm1d and Dropout(0.2); ``head`` maps the 64
    features to 8 outputs. Attribute names and layer positions define the
    state-dict keys and must stay as they are for checkpoints to load.
    """

    def __init__(self, n_features):
        super().__init__()

        self.sizes = [256, 128, 64]
        width_1, width_2, width_3 = self.sizes

        layers = [
            nn.Linear(n_features, width_1),
            nn.PReLU(),
            nn.Linear(width_1, width_2),
            nn.PReLU(),
            nn.Linear(width_2, width_3),
            nn.PReLU(),
            nn.BatchNorm1d(width_3),
            nn.Dropout(0.2),
        ]
        self.features = nn.Sequential(*layers)
        self.head = nn.Linear(width_3, 8)

    def forward(self, x, y):
        """Return ``{"logits": tensor}`` for input ``x``; ``y`` is accepted but not used."""
        hidden = self.features(x)
        return {"logits": self.head(hidden)}

def run_nn_stacker(exp_name, df, BS=64):
    """Run every stacker checkpoint of an experiment and average the outputs.

    Args:
        exp_name: folder (relative to the working directory) holding the
            ``*.pth`` state dicts of ``RSNAStackerModel``.
        df: DataFrame handed (as a copy) to ``RSNAStackerDataset`` in test mode.
        BS: batch size of the DataLoader.

    Returns:
        numpy array: element-wise mean, over all checkpoints, of the sigmoid
        of the model logits (8 columns, one row per dataset item).

    Raises:
        FileNotFoundError: if the folder holds no ``*.pth`` file. Previously
            the mean of an empty list silently produced NaN.
    """
    # Sorted for a reproducible run order; checked before building the dataset
    # so that a wrong folder fails fast.
    checkpoints = sorted(glob.glob(f"./{exp_name}/*.pth"))
    if not checkpoints:
        raise FileNotFoundError(f"no .pth checkpoint found in './{exp_name}'")

    ds = RSNAStackerDataset(df.iloc[:].copy(), mode="test")
    # shuffle=False gives every checkpoint the same batches, so a single
    # loader is reused for all of them.
    dl = DataLoader(ds, shuffle=False, batch_size = BS, num_workers = 2)

    preds_all = []
    for checkpoint in checkpoints:
        print(f"running model {checkpoint}")

        model = RSNAStackerModel(n_features=ds.X.shape[1]).to("cpu").eval()

        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        model_weights = torch.load(checkpoint, map_location="cpu")
        model.load_state_dict(collections.OrderedDict(model_weights), strict=True)
        del model_weights

        with torch.no_grad():
            preds = []
            for batch in tqdm(dl):
                inputs, target = [x.to("cpu") for x in batch]
                out = model(inputs, target)
                preds.append(out["logits"].float().sigmoid().detach().cpu().numpy())

        preds_all.append(np.concatenate(preds, axis=0))

        del model

    del dl
    del ds

    return np.mean(preds_all, axis=0)

# Store the frame-level ensemble probabilities under the pred_frac_* names.
# NOTE(review): presumably these are the features RSNAStackerDataset reads — confirm.
df[pred_cols] = preds

# Zero-filled label columns (no ground truth at inference time).
df[label_cols] = 0

nn_stacker_preds = run_nn_stacker("./final-nn-v0-ff", df, BS=64)

In [None]:
nn_stacker_preds.shape

In [None]:
# Start from the per-study baseline predictions.
# NOTE: `preds` is rebound here; it previously held the frame-level array.
preds = preds_pp.copy()

# NOTE: pred_cols is rebound too; it previously held the "pred_frac_c*" names.
pred_cols = [
    "C1",
    "C2",
    "C3",
    "C4",
    "C5",
    "C6",
    "C7",
    "patient_overall"
]

# Blend: 70 % NN stacker, 30 % baseline. The array is combined positionally.
# NOTE(review): this assumes nn_stacker_preds has one row per study in the
# same order as preds_pp and 8 columns ordered like pred_cols — confirm
# against RSNAStackerDataset.
preds[pred_cols] = 0.7*nn_stacker_preds + 0.3*preds[pred_cols]

In [None]:
preds.head()

In [None]:
# Reshape the per-study table into the long submission format: one
# "<StudyInstanceUID>_<target>" row per study and prediction column, in
# study-major order.
submission_pairs = [
    (row.StudyInstanceUID+"_"+c, row[c])
    for _, row in preds.iterrows()
    for c in pred_cols
]
row_ids = [row_id for row_id, _ in submission_pairs]
fractured = [value for _, value in submission_pairs]

sub_df = pd.DataFrame({
    "row_id": row_ids,
    "fractured": fractured
})

In [None]:
# Write the final predictions (columns row_id, fractured) without the index.
sub_df.to_csv(OUTPUT_FILE, index=False)

In [None]:
sub_df