In [1]:
import cv2 as cv
import glob
import h5py
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import openslide
import os
import pandas as pd
import random
from PIL import Image
from sklearn.model_selection import train_test_split

In [2]:
# --- Raw inputs -------------------------------------------------------------
# The raw directory holds the CSV that assigns each case to a split.
PATH_TO_RAW_DATA = "/deep/group/aihc-bootcamp-fall2021/lymphoma/raw"
PATH_TO_TRAIN_TEST_SPLIT = os.path.join(PATH_TO_RAW_DATA, "train_test_split.csv")

# --- Processed data ---------------------------------------------------------
# Per-TMA patch files are read from PATH_TO_TMA_PATCHES; the combined split
# files are written under PATH_TO_DATA_SPLITS.
PATH_TO_PROCESSED_DATA = "/deep/group/aihc-bootcamp-fall2021/lymphoma/processed"
PATH_TO_TMA_PATCHES = os.path.join(PATH_TO_PROCESSED_DATA, "tma_patches")
PATH_TO_DATA_SPLITS = os.path.join(
    PATH_TO_PROCESSED_DATA, "data_splits/transcription_splits/tma_patches"
)

# Output HDF5 files, one per split.
PATH_TO_TRAIN_DATA, PATH_TO_TEST_DATA = (
    os.path.join(PATH_TO_DATA_SPLITS, split_filename)
    for split_filename in ("train.hdf5", "test.hdf5")
)

In [3]:
# Load the split assignment table, then flatten its 'case' and 'split' columns
# into a plain {case: split} lookup (a repeated case keeps its last row's split).
data_split_df = pd.read_csv(PATH_TO_TRAIN_TEST_SPLIT, sep=',')
data_split_map = dict(zip(data_split_df['case'], data_split_df['split']))

In [4]:
# Collect every per-TMA patch file. glob returns matches in arbitrary order,
# so the listing below is not sorted.
tma_hdf5_filenames = glob.glob(f"{PATH_TO_TMA_PATCHES}/tma*.hdf5")
tma_hdf5_filenames

['/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6b.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma1.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma5.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma4.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma8.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma2.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6a.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma3.hdf5']

In [5]:
def build_data_splits_from_hdf5_files(tma_hdf5_filenames):
    """Copy each dataset from the TMA HDF5 files into the train or test HDF5 file.

    Every top-level dataset name in each input file is reduced to a lookup key
    (the text before the first underscore, spaces removed, truncated to five
    characters) and looked up in the module-level ``data_split_map``. Datasets
    whose key is absent from the map are skipped; the rest are written to
    ``PATH_TO_TRAIN_DATA`` or ``PATH_TO_TEST_DATA`` according to their split.

    Output datasets are named ``<source name>_v<N>``, where N counts how many
    times that source name has been seen so far across all input files. The
    numbering therefore depends on the order of ``tma_hdf5_filenames``.

    Both output files are opened in "w" mode (any existing content is
    truncated) and are closed before this function returns, including when an
    exception is raised, so all written data is flushed to disk.

    Args:
        tma_hdf5_filenames: Iterable of paths to the input HDF5 files.

    Returns:
        A tuple ``(included_patient_ids, excluded_patient_ids)`` of sets of
        lookup keys: those found in ``data_split_map`` and those not found.

    Raises:
        AssertionError: If a mapped split is neither "train" nor "test".
    """
    included_patient_ids = set()
    excluded_patient_ids = set()
    # How many times each source dataset name has been written so far.
    patient_id_repeats = {}
    # Attributes carried over verbatim from each source dataset.
    copied_attrs = ("tma_id", "patient_id", "who_diagnosis", "clpa_diagnosis", "label")

    # The context manager guarantees both output files are flushed and closed.
    with h5py.File(PATH_TO_TRAIN_DATA, "w") as train_f, \
            h5py.File(PATH_TO_TEST_DATA, "w") as test_f:
        for filename in tma_hdf5_filenames:
            print(filename)

            with h5py.File(filename, "r") as f:
                for patient_id in f.keys():
                    data = f[patient_id]
                    # Normalize the dataset name to the key format used by data_split_map.
                    patient_id_key = patient_id.split("_")[0].replace(" ", "")[:5]
                    if patient_id_key not in data_split_map:
                        excluded_patient_ids.add(patient_id_key)
                        continue

                    included_patient_ids.add(patient_id_key)
                    data_split = data_split_map[patient_id_key]
                    if data_split == "train":
                        out_f = train_f
                    else:
                        assert data_split == "test", f"unexpected split: {data_split!r}"
                        out_f = test_f

                    # Deal with duplicate patients: the same dataset name can appear in
                    # more than one input file, so suffix each copy with a version number.
                    patient_id_repeats[patient_id] = patient_id_repeats.get(patient_id, 0) + 1
                    name = f"{patient_id}_v{patient_id_repeats[patient_id]}"

                    dset = out_f.create_dataset(name, data=data, dtype='uint8', chunks=True)
                    for attr_name in copied_attrs:
                        dset.attrs[attr_name] = data.attrs[attr_name]

    return (included_patient_ids, excluded_patient_ids)

# Write train.hdf5 / test.hdf5 and keep the sets of matched and unmatched case keys.
included_patient_ids, excluded_patient_ids = build_data_splits_from_hdf5_files(
    tma_hdf5_filenames
)

# TODO(vishankar): Check for DUPLICATE patient_ids ACROSS TMA maps (because these are only labeled with patient ID).
# TODO(vishankar): There was a mistake in TMA6A: E0710B was incorrectly labeled as E0709B.

/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6b.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma1.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma5.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma4.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma8.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma2.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6a.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma3.hdf5


In [6]:
# Cases listed in train_test_split.csv that did not end up in train/test.hdf5,
# i.e. patient ids that are missing inside tma*.hdf5.
print(sorted(case for case in data_split_map if case not in included_patient_ids))

# Question: Why aren't all the patient IDs in List_for_Oscar in the included list?
# Answer: For E0477 and E0758, we couldn't extract any patches. Most of the other patient IDs
# are labeled "excluded" and are either "NOT ON TMA" or part of TMA7 (which we don't have in our dataset).

['E0264', 'E0274', 'E0278', 'E0279', 'E0282', 'E0286', 'E0317', 'E0330', 'E0335', 'E0360', 'E0361', 'E0452', 'E0477', 'E0500', 'E0502', 'E0507', 'E0551', 'E0555', 'E0710', 'E0758', 'E0934', 'E0935', 'E0936', 'E0937', 'E0938', 'E0939', 'E0940', 'E0941', 'E0942', 'E0943', 'E0944', 'E0945', 'E0946', 'E0947', 'E0948', 'E0949', 'E0950', 'E0951', 'E0952', 'E0953', 'E0955', 'E0956', 'E0957', 'E0959', 'E0960', 'E0961', 'E0962', 'E0963', 'E0964', 'E0965', 'E0966']
