In [1]:
import numpy as np
import pandas as pd
import os
import json
import PIL
from PIL import Image, ImageSequence, ImageOps
import sys
from tqdm import trange


In [2]:
def make_dir_if_not_exists(directory):
    '''
    Create `directory` (and any missing parent directories) if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        directory: path of the directory to create.

    Raises:
        FileExistsError: if `directory` exists but is not a directory.
    '''
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process could create the directory
    # between the check and the makedirs call.
    os.makedirs(directory, exist_ok=True)

In [3]:
def hashing_lookup(original_study_name, hashed_mrn_table):
    '''
    Translate an original study name into its hashed counterpart.

    A study name has the form '<mrn>s<study_idx>', e.g. '11081934s1'. The MRN
    part is replaced by the matching `hashed_mrn` from `hashed_mrn_table`; the
    study index is kept unchanged.

    Args:
        original_study_name: string such as '11081934s1'.
        hashed_mrn_table: DataFrame with an integer-comparable `original_mrn`
            column and a `hashed_mrn` column.

    Returns:
        The hashed study name as a string, '<hashed_mrn>s<study_idx>'.

    Raises:
        ValueError: if the name does not contain exactly one 's' separator, or
            the MRN part is not an integer.
        IndexError: if the MRN has no row in `hashed_mrn_table`.
    '''
    parts = original_study_name.split('s')
    if len(parts) != 2:
        raise ValueError(
            "study name {!r} is not of the form '<mrn>s<study_idx>'".format(original_study_name))
    mrn, study_idx = parts

    matches = hashed_mrn_table.loc[hashed_mrn_table['original_mrn'] == int(mrn), 'hashed_mrn']
    if matches.empty:
        # Same exception type the bare `.values[0]` produced, but with a
        # message that says which MRN is missing.
        raise IndexError('MRN {} not found in hashed_mrn_table'.format(mrn))

    # If an MRN appears more than once, the first matching row wins.
    hashed_mrn = str(matches.values[0])
    hashed_study_name = 's'.join([hashed_mrn, study_idx])

    return hashed_study_name

In [4]:
def resize_and_pad(tiff_image, resized_shape):
    '''
    Compute the aspect-preserving resize target and the padding needed to
    reach a square of side `resized_shape`.

    The longer side is scaled to `resized_shape`; the shorter side is scaled
    by the same ratio (truncated to an integer, but never below 1 pixel so the
    resize target stays valid for very elongated images).

    Args:
        tiff_image: object exposing `.size` as (width, height), e.g. a PIL image.
        resized_shape: side length of the square output.

    Returns:
        (new_height, new_width, pad_configuration), where pad_configuration is
        ((0, pad_along_height), (0, pad_along_width), (0, 0)): padding is added
        only after the existing rows/columns, and the third axis is not padded.
    '''
    original_width, original_height = tiff_image.size

    if original_height > original_width:
        # Taller than wide: height becomes the full side, width shrinks.
        new_height = resized_shape
        new_width = max(1, int(resized_shape * (original_width / original_height)))
    else:
        # Wider than tall, or square: width becomes the full side.
        new_width = resized_shape
        new_height = max(1, int(resized_shape * (original_height / original_width)))

    # Whichever side already equals resized_shape gets zero padding.
    pad_configuration = (
        (0, resized_shape - new_height),
        (0, resized_shape - new_width),
        (0, 0),
    )

    return new_height, new_width, pad_configuration

## Save as PNG

In [5]:
# Directory holding the view/diagnosis class-to-integer mapping JSON files.
class_to_integer_mapping_dir="/cluster/tufts/hugheslab/zhuang12/Echo_ClinicalManualScript_1112/split_info"
# Side length of the square output images (passed to resize_and_pad).
resized_shape=112
# Comma-separated path(s) of the image-list CSV file(s); split on ',' by the extraction cell.
ImageList_fullpaths = "/cluster/tufts/hugheslab/zhuang12/Echo_ClinicalManualScript_1112/split_info_regenerate_for_data_release_convenience_v2/seed0/shared_test_this_seed/test.csv"
# Directory the extracted PNG files are written to.
result_save_dir = './sample_images'


In [6]:
#main
# Extract the first frame of each TIFF in the image list, convert it to
# grayscale, resize it (aspect ratio preserved) and zero-pad it to
# resized_shape x resized_shape, then save it as '<hashed study>_<index>.png'.

# Lookup table used by hashing_lookup to map original MRNs to hashed MRNs.
hashed_mrn_table = pd.read_csv('/cluster/tufts/hugheslab/zhuang12/JACC_DataRelease/20220412version/MRN_hashing_table_20220412.csv')

make_dir_if_not_exists(result_save_dir)

with open(os.path.join(class_to_integer_mapping_dir, 'view_class_to_integer_mapping.json')) as view_file:
    view_class_to_integer_mapping = json.load(view_file)

with open(os.path.join(class_to_integer_mapping_dir, 'diagnosis_class_to_integer_mapping.json')) as diagnosis_file:
    diagnosis_class_to_integer_mapping = json.load(diagnosis_file)


# Split into a new name so the configured string is not overwritten with a
# list; overwriting it made this cell fail when run a second time.
image_list_paths = ImageList_fullpaths.split(',')
image_list_frames = []
for ImageList_fullpath in image_list_paths:
    print('Currently reading {}'.format(ImageList_fullpath))
    image_list_frames.append(pd.read_csv(ImageList_fullpath))
# Concatenate once instead of growing a DataFrame inside the loop.
this_ImageList = pd.concat(image_list_frames, ignore_index=True)


# Sorting by study name keeps all images of one study adjacent, which the
# per-study counter below relies on.
this_ImageList = this_ImageList.sort_values(by=['study_names'])
num_images_to_extract = this_ImageList.shape[0]
print('#images to extract: {}'.format(num_images_to_extract))

# The loop stops after this many images have been written.
max_images_to_save = 1
global_count = 0

#initialize the first current study
current_study = hashing_lookup(this_ImageList.iloc[0].study_names.split('_')[0], hashed_mrn_table)
print('initial current_study: {}'.format(current_study))
current_study_count = 0

for i in trange(num_images_to_extract):
    if global_count == max_images_to_save:
        break

    print('current_study: {}'.format(current_study))
    this_row = this_ImageList.iloc[i]
    this_tiff_fullpath = this_row.tiff_paths
    this_tiff_viewlabel = this_row.view_labels
    this_tiff_diagnosislabel = this_row.diagnosis_labels

    # The part of study_names before the first '_' is '<mrn>s<study_idx>'.
    this_study = hashing_lookup(this_row.study_names.split('_')[0], hashed_mrn_table)

    print('this_study: {}'.format(this_study))
    # current_study_count numbers the images within one study, starting at 1.
    if this_study == current_study:
        current_study_count += 1
    else:
        current_study = this_study
        current_study_count = 1

    #the ImageList already dont contain broken tiff, so don't need to use exception
    # The context manager closes the TIFF file handle once the frame is read.
    with Image.open(this_tiff_fullpath) as im:
        #only take the first frame
        # Seek explicitly: the previous enumerate/break loop over
        # ImageSequence.Iterator left multi-frame files positioned on frame 1
        # (the second frame), because the iterator seeks before the break
        # condition is evaluated.
        im.seek(0)
        #convert to grayscale (while the file is still open)
        tiff_image = ImageOps.grayscale(im)

    #resize
    this_tiff_image_new_height, this_tiff_image_new_width, pad_configuration = resize_and_pad(tiff_image, resized_shape)

    tiff_image = tiff_image.resize((this_tiff_image_new_width, this_tiff_image_new_height))

    #expand from (H,W) to (H,W,1)
    tiff_image_array = np.expand_dims(np.array(tiff_image), axis=2)

    #pad
    tiff_image_array = np.pad(tiff_image_array, pad_width=pad_configuration, mode='constant', constant_values=0)

    # Drop only the channel axis added above before handing the array to PIL.
    im_to_save = Image.fromarray(tiff_image_array.squeeze(axis=2))

    # File index is zero-based within the study, hence current_study_count-1.
    im_to_save.save("{}/{}_{}.png".format(result_save_dir, this_study, current_study_count-1))
    global_count +=1
    


  0%|          | 1/3602 [00:00<01:49, 32.86it/s]

Currently reading /cluster/tufts/hugheslab/zhuang12/Echo_ClinicalManualScript_1112/split_info_regenerate_for_data_release_convenience_v2/seed0/shared_test_this_seed/test.csv
#images to extract: 3602
initial current_study: 5269s1
current_study: 5269s1
this_study: 5269s1





In [7]:
# Shape of the padded array left over from the last loop iteration: (H, W, 1) after expand_dims and padding.
tiff_image_array.shape

(112, 112, 1)

## Load back as a NumPy array

In [8]:
def LoadImage(file_path):
    '''
    Read an image file and return its pixel data as a NumPy array.

    Args:
        file_path: path of the image file to read.

    Returns:
        The array produced by np.asarray on the opened PIL image.
    '''
    # Image.open is lazy and keeps the file open; the context manager closes
    # the handle as soon as the pixel data has been converted to an array.
    with PIL.Image.open(file_path) as im:
        return np.asarray(im)
    

In [9]:
# Read back the PNG written by the extraction loop (result_save_dir plus the study name shown in its output).
tiff_image_array_loadback = LoadImage('./sample_images/5269s1_0.png')

In [11]:
# The PNG was saved from the squeezed array, so it loads back as 2-D (H, W) without the channel axis.
tiff_image_array_loadback.shape

(112, 112)

## Check that the arrays are equal

In [12]:
# Compare the array read back from disk with the in-memory array (channel axis squeezed) to confirm the PNG round trip kept the pixel values.
np.array_equal(tiff_image_array_loadback, tiff_image_array.squeeze())

True