# Imports 

This notebook is based on the Kaggle kernel
https://www.kaggle.com/ekhtiar/finding-pneumo-part-1-eda-and-unet 

In [None]:
import torch # base
import numpy as np
from pathlib import Path # look in folders
import pandas as pd
import pydicom # open dicom images
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import pytorch_tools as pt # core functionality
from pytorch_tools.utils.rle import rle_to_string, rle_decode, rle_encode
import utils # competition specific functions
# Some notebooks magic
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
## Initial setup
# !pip install kaggle
# !kaggle datasets download -d jesperdramsch/siim-acr-pneumothorax-segmentation-data
# !mkdir data
# !cp ./siim-acr-pneumothorax-segmentation-data.zip ./data/
## Extract .zip into data folder

In [None]:
# Dataset locations: every input lives under ./data
data = Path("./data")
train_rle_path = data.joinpath("train-rle.csv")
train_data_folder = data.joinpath("dicom-images-train")
test_data_folder = data.joinpath("dicom-images-test")

# The .dcm files sit two directory levels below each root folder. The matches
# are sorted as Path objects first and only then turned into POSIX-style strings.
train_files_path = [dcm.as_posix() for dcm in sorted(train_data_folder.glob("./*/*/*.dcm"))]
test_files_path = [dcm.as_posix() for dcm in sorted(test_data_folder.glob("./*/*/*.dcm"))]

n_train_files = len(train_files_path)
n_test_files = len(test_files_path)
print(f"Train dataset length {n_train_files}, test dataset length {n_test_files}")

In [None]:
# load rles
# The raw file is split on ", " (comma followed by a space); every field is kept
# as a string and the first row (presumably the CSV header) is dropped.
rles = np.genfromtxt(train_rle_path, delimiter=', ', dtype=str)[1:]
print(len(rles)) ## we have more rles than images. lets merge them

train_rle_filtered_path = data / "train-rle-filtered.csv"

# Merge all RLEs that share an image id into a single RLE per image.
# This is slow, so it only runs when the cached result is missing; an existing
# train-rle-filtered.csv is left untouched.
if not train_rle_filtered_path.exists():
    unique_ids = np.unique(rles[:, 0])
    new_rles = []
    for un_idx in tqdm(unique_ids):
        # every RLE string recorded for this image id
        idx_rles = rles[rles[:, 0] == un_idx][:, 1]
        masks = [rle_decode(rle, (1024, 1024), relative=True) for rle in idx_rles]
        # sum the per-RLE masks and clip to {0, 1} so overlapping regions count once
        mask = np.clip(np.sum(masks, axis=0), 0, 1)
        new_rle = rle_to_string(rle_encode(mask))
        # an empty encoding is stored as '-1'
        new_rle = new_rle if new_rle else '-1'
        new_rles.append(new_rle)

    # One "<id>,<rle>" line per image, written without a header line and
    # without a trailing newline.
    with open(train_rle_filtered_path, 'w') as f:
        lines = [','.join([idx, rle]) for idx, rle in zip(unique_ids, new_rles)]
        result = '\n'.join(lines)
        f.write(result)

In [None]:
# Read the merged RLE file with explicit column names. header=None keeps the
# first line as data: when the file has no header line, the default
# pd.read_csv behaviour would consume the first image's record as the header.
rle_columns = ['ImageId', 'EncodedPixels']
rles_df = pd.read_csv(train_rle_filtered_path, header=None, names=rle_columns)
# If the file does start with a header line, it was just read as a data row; drop it.
if len(rles_df) and str(rles_df['ImageId'].iloc[0]).strip() == 'ImageId':
    rles_df = rles_df.iloc[1:].reset_index(drop=True)
print(len(rles_df)) # Now we have less rles than images, it's OK (see kaggle discussions)
# TODO filter those images?

In [None]:
# parse train DICOM dataset
# One metadata dict is collected per file and the DataFrame is built once at
# the end (no placeholder DataFrame is needed before the loop).
train_metadata_list = []
for file_path in tqdm(train_files_path):
    dicom_data = pydicom.dcmread(file_path)
    train_metadata = utils.dicom_to_dict(dicom_data, file_path, rles_df)
    train_metadata_list.append(train_metadata)
train_metadata_df = pd.DataFrame(train_metadata_list)

# parse test DICOM dataset
# Same procedure, except dicom_to_dict is called with encoded_pixels=False.
test_metadata_list = []
for file_path in tqdm(test_files_path):
    dicom_data = pydicom.dcmread(file_path)
    test_metadata = utils.dicom_to_dict(dicom_data, file_path, rles_df, encoded_pixels=False)
    test_metadata_list.append(test_metadata)
test_metadata_df = pd.DataFrame(test_metadata_list)

In [None]:
# Show `num_img` randomly sampled training x-rays side by side.
num_img = 3
# squeeze=False makes plt.subplots always return a 2-D array of Axes, even for
# a single panel; ravel() flattens it so `ax` is a 1-D array for any num_img >= 1.
fig, ax = plt.subplots(nrows=1, sharey=True, ncols=num_img, figsize=(num_img*12,12), squeeze=False)
ax = ax.ravel()
sampled_rows = train_metadata_df.sample(n=num_img)
for panel, (index, row) in zip(ax, sampled_rows.iterrows()):
    dataset = pydicom.dcmread(row['file_path'])
    panel.imshow(dataset.pixel_array, cmap=plt.cm.bone)
    # label the x-ray with information about the patient
    panel.text(0,0,'Age:{}, Sex: {}, Pneumothorax: {}'.format(row['patient_age'],row['patient_sex'],row['has_pneumothorax']),
               size=26,color='white', backgroundcolor='black')

In [None]:
# Draw 2 random rows whose has_pneumothorax flag equals 1
is_positive = train_metadata_df['has_pneumothorax'] == 1
train_metadata_sample = train_metadata_df.loc[is_positive].sample(n=2)
# For each sampled row, print its id and hand the file path together with the
# row's encoded mask list to utils.plot_with_mask_and_bbox
for index, row in train_metadata_sample.iterrows():
    file_path, mask_encoded_list = row['file_path'], row['encoded_pixels_list']
    image_id = row['id']
    print('image id: ' + image_id)
    utils.plot_with_mask_and_bbox(file_path, mask_encoded_list)

In [None]:
# Pick one random row whose has_pneumothorax flag equals 1 and keep its
# encoded mask list.
positive_cases = train_metadata_df[train_metadata_df['has_pneumothorax'] == 1]
train_metadata_sample = positive_cases.sample(n=1)
index, row = next(train_metadata_sample.iterrows())
mask_encoded_list = row['encoded_pixels_list']