# SIIM Pneumothorax Pre-processing Steps

## Import Data

In [4]:
import pandas as pd

# Load the train RLE annotation table.
# NOTE(review): absolute, machine-specific path — point this at your own copy of the data.
RLE_CSV_PATH = '/home/faith/MVLM/siim-acr-pneumothorax/train-rle.csv'
rle_csv = pd.read_csv(RLE_CSV_PATH)
rle_csv.head()

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.5597.151787518...,-1
1,1.2.276.0.7230010.3.1.4.8323329.12515.15178752...,-1
2,1.2.276.0.7230010.3.1.4.8323329.4904.151787518...,175349 7 1013 12 1009 17 1005 19 1003 20 1002...
3,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,407576 2 1021 7 1015 10 1013 12 1011 14 1008 ...
4,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,252069 1 1021 3 1020 4 1018 5 1018 6 1016 7 1...


## Mask Functions

In [51]:
import numpy as np

def rle2mask(rle, width, height):
    """Decode a run-length string with relative offsets into a 2-D mask.

    The string is a whitespace-separated sequence of ``offset length`` pairs.
    Each offset is *relative*: it is the number of pixels skipped since the end
    of the previous run (the first offset is counted from position 0).

    Args:
        rle: Run-length string, e.g. ``"1 2 1 1"``. An empty string or the
            ``"-1"`` no-mask sentinel (surrounding whitespace is ignored)
            yields an all-zero mask.
        width: Size of the first axis of the returned array.
        height: Size of the second axis of the returned array.

    Returns:
        Float array of shape ``(width, height)`` holding 255 inside the runs
        and 0 elsewhere. Run positions index the flat buffer, which is then
        reshaped in C order to ``(width, height)``.

    Raises:
        IndexError: If an offset has no matching length (odd token count).
        ValueError: If a token is not an integer.
    """
    mask = np.zeros(width * height)
    tokens = rle.split()

    # "-1" marks an image without any mask; without this guard it would be
    # parsed as a lone offset with no length and fail with an IndexError.
    if not tokens or tokens == ["-1"]:
        return mask.reshape(width, height)

    array = np.asarray([int(x) for x in tokens])
    starts = array[0::2]
    lengths = array[1::2]

    current_position = 0
    for index, start in enumerate(starts):
        # Offsets are relative to the end of the previous run.
        current_position += start
        mask[current_position:current_position + lengths[index]] = 255
        current_position += lengths[index]

    return mask.reshape(width, height)

# def mask2rle(img, width, height):
#     rle = []
#     lastColor = 0
#     currentPixel = 0
#     runStart = -1
#     runLength = 0

#     for x in range(width):
#         for y in range(height):
#             currentColor = img[x][y]
#             if currentColor != lastColor:
#                 if currentColor == 255:
#                     runStart = currentPixel
#                     runLength = 1
#                 else:
#                     rle.append(str(runStart))
#                     rle.append(str(runLength))
#                     runStart = -1
#                     runLength = 0
#                     currentPixel = 0
#             elif runStart > -1:
#                 runLength += 1
#             lastColor = currentColor
#             currentPixel+=1

#     return " ".join(rle)

# More efficient, vectorised implementation (supersedes the commented-out loop version above)
def mask2rle(img):
    '''
    Run-length encode a binary mask (efficient version, from @paulorzp).
    --
    img: numpy array, 1 - mask, 0 - background
    Returns a string of space-separated "start length" pairs. Starts are
    absolute, 1-based positions in the column-major (transposed) flattening
    of img. An all-background mask gives an empty string.
    Source: https://www.kaggle.com/xhlulu/efficient-mask2rle
    '''
    # Column-major flattening; identical to img.T.flatten().
    flat = img.flatten(order='F')
    # A background pixel on each side guarantees every run has both a
    # rising and a falling edge, even when it touches the border.
    padded = np.pad(flat, 1)
    # 1-based positions where the value changes: start, end, start, end, ...
    edges = np.where(padded[:-1] != padded[1:])[0] + 1
    # Turn every "end" into a run length.
    edges[1::2] -= edges[::2]
    return ' '.join(map(str, edges))

## Split by ImageId

In [54]:
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` (used on the groupby below) is available.
tqdm.pandas()

def collate_masks(row, width=1024, height=1024):
    """Merge all RLE annotations of one image into a single RLE string.

    Args:
        row: The group of annotation rows belonging to one ImageId (as passed
            in by ``groupby(...).apply``). Its ``' EncodedPixels'`` column
            (the column name starts with a space) holds one RLE string per
            annotation, or the ``-1`` sentinel when there is no mask.
        width: First mask dimension handed to ``rle2mask`` (default 1024).
        height: Second mask dimension handed to ``rle2mask`` (default 1024).

    Returns:
        The ``mask2rle`` encoding of the union of all masks; an empty string
        when the image has no mask at all.

    NOTE(review): ``rle2mask`` reads *relative* offsets, while ``mask2rle``
    writes *absolute*, 1-based starts over the transposed array. The returned
    string is therefore not in the same encoding as the input strings —
    confirm that downstream consumers expect this.
    """
    union = np.zeros([width, height], dtype=bool)
    for rle in row[' EncodedPixels']:
        # Compare the stripped value so that both " -1" and "-1" are treated
        # as the no-mask sentinel.
        if rle.strip() != "-1":
            # Overlapping annotations collapse into one binary mask.
            union |= rle2mask(rle, width, height) > 0

    return mask2rle(union)

# Collapse the annotations of every ImageId into one RLE string per image.
per_image_rle = rle_csv.groupby("ImageId").progress_apply(collate_masks)
combined_df = per_image_rle.to_frame(name='rle_masks').reset_index()
combined_df.head()

  0%|          | 0/10675 [00:00<?, ?it/s]

100%|██████████| 10675/10675 [01:07<00:00, 158.65it/s]


Unnamed: 0,ImageId,rle_masks
0,1.2.276.0.7230010.3.1.4.8323329.1000.151787516...,
1,1.2.276.0.7230010.3.1.4.8323329.10000.15178752...,
2,1.2.276.0.7230010.3.1.4.8323329.10001.15178752...,
3,1.2.276.0.7230010.3.1.4.8323329.10002.15178752...,
4,1.2.276.0.7230010.3.1.4.8323329.10003.15178752...,


In [55]:
# Check there are no longer ImageId duplicates in the dataset
# (an empty frame means every ImageId occurs exactly once)
duplicate_rows = combined_df['ImageId'].duplicated()
combined_df.loc[duplicate_rows]

Unnamed: 0,ImageId,rle_masks


In [62]:
# An empty rle_masks string means the image has no mask (negative sample).
has_mask = combined_df['rle_masks'] != ""
num_positive_samples = int(has_mask.sum())
num_negative_samples = int((~has_mask).sum())

print("Number of positive samples: {}".format(num_positive_samples))
print("Number of negative samples: {}".format(num_negative_samples))
print("Positive to Negative sample proportion: {}".format(num_positive_samples/num_negative_samples))

Number of positive samples: 2379
Number of negative samples: 8296
Positive to Negative sample proportion: 0.2867647058823529


In [105]:
# Conduct a stratified split of 0.7:0.15:0.15 to preserve the label ratio
from sklearn.model_selection import StratifiedShuffleSplit

# Stage 1: 70% train vs. 30% held out, stratified on "has a mask".
# n_splits=2 is configured, but only the first split is consumed via next().
class_index = (combined_df['rle_masks'] != "").astype(int) # 1 - positive, 0 - negative
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=0)
train_index, valtest_index = next(sss.split(combined_df, class_index))
train_df = combined_df.iloc[train_index].copy()
valtest_df = combined_df.iloc[valtest_index]

# Stage 2: halve the held-out part into valid and test, again stratified.
# The indices returned here are positions within valtest_df, hence .iloc on it.
class_index = (valtest_df['rle_masks'] != "").astype(int)
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=0)
val_index, test_index = next(sss.split(valtest_df, class_index))
val_df, test_df = (valtest_df.iloc[positions].copy() for positions in (val_index, test_index))

In [106]:
# Row count of each split.
print("Train length: {}".format(train_df.shape[0]))
print("Valid length: {}".format(val_df.shape[0]))
print("Test length: {}".format(test_df.shape[0]))

Train length: 7472
Valid length: 1601
Test length: 1602


In [114]:
# Tag every row with the split it belongs to.
train_df['split'] = 'train'
val_df['split'] = 'valid'
test_df['split'] = 'test'

# Concatenate the splits, restore the " -1" sentinel for images without a
# mask, and rename rle_masks to " EncodedPixels" (leading space included).
# NOTE(review): only the sentinel follows the original convention; the
# non-empty strings come from mask2rle — confirm the encoding downstream expects.
final_df = (
    pd.concat([train_df, val_df, test_df], axis=0)
    .assign(rle_masks=lambda frame: frame["rle_masks"].replace("", " -1"))
    .rename(columns={"rle_masks": " EncodedPixels"})
    .reset_index(drop=True)
)
final_df.head()

Unnamed: 0,ImageId,EncodedPixels,split
0,1.2.276.0.7230010.3.1.4.8323329.3440.151787517...,-1,train
1,1.2.276.0.7230010.3.1.4.8323329.1195.151787516...,-1,train
2,1.2.276.0.7230010.3.1.4.8323329.1755.151787516...,23977 19 24998 25 26019 31 27038 37 28056 45 2...,train
3,1.2.276.0.7230010.3.1.4.8323329.13713.15178752...,-1,train
4,1.2.276.0.7230010.3.1.4.8323329.3901.151787518...,-1,train


## Run checks

In [115]:
# Description of final DataFrame: count / unique / top / freq for each column
final_df.describe()

Unnamed: 0,ImageId,EncodedPixels,split
count,10675,10675,10675
unique,10675,2380,3
top,1.2.276.0.7230010.3.1.4.8323329.10353.15178752...,-1,train
freq,1,8296,7472


In [116]:
# Number of train, valid and test samples (rows per value of the `split` column)
final_df["split"].value_counts()

split
train    7472
test     1602
valid    1601
Name: count, dtype: int64

In [121]:
# Preview the first rows of the final DataFrame
final_df.head()

Unnamed: 0,ImageId,EncodedPixels,split
0,1.2.276.0.7230010.3.1.4.8323329.3440.151787517...,-1,train
1,1.2.276.0.7230010.3.1.4.8323329.1195.151787516...,-1,train
2,1.2.276.0.7230010.3.1.4.8323329.1755.151787516...,23977 19 24998 25 26019 31 27038 37 28056 45 2...,train
3,1.2.276.0.7230010.3.1.4.8323329.13713.15178752...,-1,train
4,1.2.276.0.7230010.3.1.4.8323329.3901.151787518...,-1,train


In [130]:
# Number of positive/negative samples for each split
# class: 1 - has a mask, 0 - " -1" sentinel (no mask)
is_positive = final_df[' EncodedPixels'] != " -1"
filtered_df = final_df.assign(**{'class': is_positive.astype(int)})
filtered_df.groupby(['split', 'class'])[['ImageId']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ImageId
split,class,Unnamed: 2_level_1
test,0,1245
test,1,357
train,0,5807
train,1,1665
valid,0,1244
valid,1,357


In [102]:
# Verify that there is no data leakage from train set to other splits
# (False means no train ImageId appears in valid or test)
train_ids = train_df['ImageId']
bool(train_ids.isin(val_df['ImageId']).any() or train_ids.isin(test_df['ImageId']).any())

False

In [100]:
# Verify that there is no data leakage from valid set to other splits
# (False means no valid ImageId appears in train or test)
val_ids = val_df['ImageId']
bool(val_ids.isin(train_df['ImageId']).any() or val_ids.isin(test_df['ImageId']).any())

False

In [101]:
# Verify that there is no data leakage from test set to other splits
# (False means no test ImageId appears in train or valid)
test_ids = test_df['ImageId']
bool(test_ids.isin(train_df['ImageId']).any() or test_ids.isin(val_df['ImageId']).any())

False