# Please Create Your Own Copy

In [1]:
import pandas as pd

# https://docs.python.org/3/library/os.path.html
import os
# https://scikit-image.org/docs/0.7.0/api/skimage.io.html
import skimage.io
# https://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html#resize
import cv2

from PIL import Image

# https://github.com/albumentations-team/albumentations
#import albumentations

In [2]:
CONVERT = False
# Paths
BASE_PATH = '/project/data'

image_dir = f'{BASE_PATH}/train_images'
mask_dir = f'{BASE_PATH}/train_label_masks'

In [3]:
# Load train dataframes
train_df = pd.read_csv(f'{BASE_PATH}/train.csv')

In [4]:
# Unify the gleason score notations from two providers
gleasons = train_df.gleason_score.unique()
print(gleasons)
print(train_df[train_df.gleason_score=='negative'].isup_grade.unique())
# Change 'negative' to '0+0'
train_df['gleason_score'] = train_df['gleason_score'].apply(lambda score: '0+0' if score == 'negative' else score)

['0+0' '4+4' '3+3' '4+3' 'negative' '4+5' '3+4' '5+4' '5+5' '5+3' '3+5']
[0]


In [5]:
# Check & correct the gleason-ISUP mapping
for gleason_score in gleasons:
    print(f'gleason {gleason_score} maps to ISUP {train_df[train_df.gleason_score==gleason_score].isup_grade.unique()}')
mislabel = train_df[(train_df.gleason_score=='4+3') & (train_df.isup_grade==2)].index
print(f'Drop mislabeled entries {mislabel}')
train_df.drop(mislabel, inplace=True)

gleason 0+0 maps to ISUP [0]
gleason 4+4 maps to ISUP [4]
gleason 3+3 maps to ISUP [1]
gleason 4+3 maps to ISUP [3 2]
gleason negative maps to ISUP []
gleason 4+5 maps to ISUP [5]
gleason 3+4 maps to ISUP [2]
gleason 5+4 maps to ISUP [5]
gleason 5+5 maps to ISUP [5]
gleason 5+3 maps to ISUP [4]
gleason 3+5 maps to ISUP [4]
Drop mislabeled entries Int64Index([7273], dtype='int64')


In [6]:
# Number of each gleason_score
for gleason_score in gleasons:
    print(f'gleason {gleason_score} has {len(train_df[train_df.gleason_score==gleason_score])} samples')

# Random choose 200 images from each gleason score
# https://note.nkmk.me/en/python-pandas-sample/
gleason_dict = {}
for gleason_score in gleasons:
    sample_num = min(200, len(train_df[train_df.gleason_score == gleason_score]))
    gleason_dict[gleason_score] = train_df[train_df.gleason_score == gleason_score].sample(n=sample_num).index

gleason 0+0 has 2892 samples
gleason 4+4 has 1126 samples
gleason 3+3 has 2666 samples
gleason 4+3 has 1242 samples
gleason negative has 0 samples
gleason 4+5 has 849 samples
gleason 3+4 has 1342 samples
gleason 5+4 has 248 samples
gleason 5+5 has 127 samples
gleason 5+3 has 43 samples
gleason 3+5 has 80 samples


In [7]:
isup_grades = train_df.isup_grade.unique()
isup_df = {}
for grade in isup_grades:
    print(grade)
    isup_df[grade] = train_df[train_df.isup_grade==grade]
    
# https://pythonhealthcare.org/2018/12/22/112-splitting-data-set-into-training-and-test-sets-using-pandas-dataframes-methods/
isup_train = {}
isup_test = {}
for grade in isup_grades:
    df = isup_df[grade].copy()
    isup_train[grade] = df.sample(frac=0.8, random_state=10)
    isup_test[grade] = df.drop(isup_train[grade].index)

0
4
1
3
5
2


In [8]:
def png_convert(save_path, img_id):
    #img_id = img_df.image_id
    img_path = os.path.join(image_dir, f'{img_id}.tiff')
    img_data = skimage.io.MultiImage(img_path)
    #for img in img_sample:
    #    print(img.shape)
    if len(img_data) == 0:
        print(img_id)
        return
    img_png = cv2.resize(img_data[-1], (512, 512))
    cv2.imwrite(f'{save_path}/{img_id}.png', img_png)

In [9]:
if CONVERT:
    # Convert the train dataset
    # There exist an empty image inside train dataframes
    for grade in isup_grades:
        print(grade)
        for img_id in isup_train[grade].image_id:
            png_convert('yi_data/train_512_512_3', img_id)
    # Convert the train dataset
    for grade in isup_grades:
        print(grade)
        for img_id in isup_test[grade].image_id:
            png_convert('yi_data/test_512_512_3', img_id)

In [12]:
# Drop that empty iamge from training dataset
train_df[train_df.image_id == 'bbfc17c7ac58d38b5eaaf72c53ece10d'].index
isup_train[0] = isup_train[0].drop(7758)

if CONVERT:
    # Save split to CSV files
    my_train_df = []
    my_test_df = []
    for grade in isup_grades:
        my_train_df.append(isup_train[grade])
        my_test_df.append(isup_test[grade])
    #print(my_train_df)
    pd.concat(my_train_df).to_csv('yi_data/train_512.csv', index=False)
    pd.concat(my_test_df).to_csv('yi_data/test_512.csv', index=False)