In [1]:
import os
import pandas as pd
from pandas import HDFStore
import IPython
from IPython.display import Image, display
import pyarrow
from multiprocessing.pool import ThreadPool as Pool

In [2]:
from common.constatns import data_location, vqa_specs_location, fn_meta, augmented_data_location
from common.utils import VerboseTimer
from common.functions import get_highlited_function_code, generate_image_augmentations,  get_image
from common.os_utils import File


In [3]:
print(f'loading from:\n{data_location}')
with VerboseTimer("Loading Data"):
    with HDFStore(data_location) as store:
         df_data = store['data']

df_data = df_data[df_data.group.isin(['train','validation'])]
print(f'Data length: {len(df_data)}')        
df_data.head(2)

loading from:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5
Loading Data: 0:00:05.880345
Data length: 5913


Unnamed: 0,index,image_name,question,answer,group,path,original_question,original_answer,tumor,hematoma,brain,abdomen,neck,liver,imaging_device,answer_embedding,question_embedding
0,0,rjv03401.jpg,what does mri show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,lesion at tail of pancreas,True,False,False,False,False,False,mri,"[[3.8335671424865723, 0.9851416349411011, 0.60...","[[3.8335671424865723, 0.9851416349411011, 0.60..."
1,1,AIAN-14-313-g002.jpg,where does axial section mri abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,where does axial section mri abdomen show hypo...,in distal pancreas,False,False,False,True,False,False,mri,"[[0.9880439043045044, 0.907943844795227, -1.30...","[[0.9880439043045044, 0.907943844795227, -1.30..."


### For the augmaentation we will use the following code:

In [4]:
code = get_highlited_function_code(generate_image_augmentations,remove_comments=False)
IPython.display.display(code)  

In [5]:
df_train = df_data[df_data.group == 'train']

image_paths = df_train.path.drop_duplicates()
print(len(image_paths))

def get_file_info(fn):
        image_folder, full_file_name = os.path.split(fn)
        file_name, ext = full_file_name.split('.')[-2:]        
        output_dir = os.path.join(image_folder,'augmentations',full_file_name+'\\')
        return (fn, file_name, ext, output_dir)

images_info = [get_file_info(p) for p in image_paths]        
non_existing_paths = [(fn, file_name, ext, output_dir) for (fn, file_name, ext, output_dir) in images_info if not os.path.isdir(output_dir)]
non_existing_paths = [(i, fn, file_name, ext, output_dir) for i, (fn, file_name, ext, output_dir) in enumerate(non_existing_paths)]


print(f'Generating augmentations for {len(non_existing_paths)} images')


def augments_single_image(tpl_data)  :
    try:       
        (i, curr_image_path, file_name, ext, output_dir) = tpl_data
        msg = (f'Augmenting ({i+1}/{len(non_existing_paths)})\t"{file_name}" -> {output_dir}')  
        if i %100 == 0:
            print(msg)
        File.validate_dir_exists(output_dir)
        generate_image_augmentations(curr_image_path, output_dir)
        res = 1
    except Exception as e: 
        msg = str(e)
        res = 0
    return (res,msg)


try:
    # for tpl_data in non_existing_paths:
         #augments_single_image(tpl_data)
    pool = Pool(processes=8)
    inputs = non_existing_paths
    pool_res = pool.map(augments_single_image, inputs)
    pool.terminate()

except Exception as ex:
    print(f'Error:\n{str(ex)}')

2278
Generating augmentations for 2099 images
Augmenting (1/2099)	"JRMS-18-86-g001" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\JRMS-18-86-g001.jpg\


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Augmenting (201/2099)	"MEAJO-19-340-g003" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\MEAJO-19-340-g003.jpg\
Augmenting (401/2099)	"JBCP-6-40-g004" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\JBCP-6-40-g004.jpg\
Augmenting (101/2099)	"NAJMS-3-39-g001" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\NAJMS-3-39-g001.jpg\
Augmenting (301/2099)	"1477-7800-3-15-3" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\1477-7800-3-15-3.jpg\
Augmenting (501/2099)	"jkns-56-272-g004" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\jkns-56-272-g004.jpg\
Augmenting (801/2099)	"ymj-51-270-g002" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\ymj-51-270-g002.jpg\
Augmenting (601/2099)	"1748-717X-2-2-1" -> C:\Users\Public\Documents\Data\20

In [13]:
failes = [tpl[1] for tpl in pool_res if tpl[0]==0]
successes = [tpl[1] for tpl in pool_res if tpl[0]==1]


f_summary = '\n'.join(failes[:5])
s_summary = '\n'.join(successes[:5])
summary = f'success: {len(successes)}\n{s_summary}\n\nfailes: {len(failes)}\n{f_summary}'.strip()

print(summary)

success: 2099
Augmenting (1/2099)	"JRMS-18-86-g001" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\JRMS-18-86-g001.jpg\
Augmenting (2/2099)	"IJRI-18-230-g002" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\IJRI-18-230-g002.jpg\
Augmenting (3/2099)	"1757-1626-2-10-1" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\1757-1626-2-10-1.jpg\
Augmenting (4/2099)	"JNRP-3-84-g003" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\JNRP-3-84-g003.jpg\
Augmenting (5/2099)	"0392-100X-32-202-g001" -> C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\0392-100X-32-202-g001.jpg\

failes: 0


In [14]:

a = images_info[:1]
a = images_info
aug_dict = {image_path:output_dir for (image_path, file_name, ext, output_dir) in a}

curr_idx = df_train.tail(1).index[0] +1

df_augments = df_train.copy()
df_augments['augmentation'] = 0
df_augments['idx'] = 0

print(len(df_augments))
new_rows = []
with VerboseTimer("Collecting augmented rows"):
    for image_path, output_dir in aug_dict.items():
        #print(image_path)
        image_rows = df_augments[df_augments.path == image_path]
        for i_row, row in image_rows.iterrows():
            #print(i_row)
            augment_files = [os.path.join(output_dir, fn) for fn in sorted(os.listdir(output_dir))]

            for i_augment, augment_path in enumerate(augment_files):
                r = row.copy()
                r.path = augment_path            
#                 r.image = get_image(augment_path)
                r.augmentation = i_augment + 1 
                r.idx = curr_idx
                curr_idx+=1
                r.reset_index()
                new_rows.append(r)        


5413
Collecting augmented rows: 0:00:27.446532


In [15]:
with VerboseTimer("Creating rows dataframe"):
    df_augmented_rows = pd.DataFrame(new_rows)
    
df = pd.concat([df_train, df_augmented_rows])    
print(len(df))

df.head(1)

Creating rows dataframe: 0:00:01.877097
32421


Unnamed: 0,abdomen,answer,answer_embedding,augmentation,brain,group,hematoma,idx,image_name,imaging_device,index,liver,neck,original_answer,original_question,path,question,question_embedding,tumor
0,False,tumor at tail pancreas,"[[3.8335671424865723, 0.9851416349411011, 0.60...",,False,train,False,,rjv03401.jpg,mri,0,False,False,lesion at tail of pancreas,what does mri show?,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,"[[3.8335671424865723, 0.9851416349411011, 0.60...",True


## Giving a meaningful index across dataframes:

In [16]:
df = df.sort_values(['augmentation', 'idx'], ascending=[True, True])


In [17]:

len_df = len(df)
idxs = range(0, len_df)
len_idx = len(set(idxs))
assert  len_idx== len_df , f'length of indexes ({len_idx}) did not match length of dataframe ({len_df})'
df.idx = idxs

In [18]:
df.iloc[[0,1,-2,-1]]

Unnamed: 0,abdomen,answer,answer_embedding,augmentation,brain,group,hematoma,idx,image_name,imaging_device,index,liver,neck,original_answer,original_question,path,question,question_embedding,tumor
0,False,tumor at tail pancreas,"[[3.8335671424865723, 0.9851416349411011, 0.60...",1.0,False,train,False,0,rjv03401.jpg,mri,0,False,False,lesion at tail of pancreas,what does mri show?,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,"[[3.8335671424865723, 0.9851416349411011, 0.60...",True
2799,False,mri,"[[-1.8525879383087158, -1.3275012969970703, 0....",1.0,False,train,False,1,rjv03401.jpg,mri,2799,False,False,mri,what shows lesion at tail of pancreas?,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows tumor at tail pancreas?,"[[-1.8525879383087158, -1.3275012969970703, 0....",True
5411,False,cortical bone right mibular ramus,"[[-0.5690958499908447, 0.053942739963531494, -...",,False,train,False,32419,cro-0007-0732-g03.jpg,ct,5411,False,False,of the cortical bone in the right mandibular r...,what does sagittal ct show erosion of?,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does sagittal ct show erosion ?,"[[-0.5690958499908447, 0.053942739963531494, -...",False
5412,False,mri,"[[-1.8525879383087158, -1.3275012969970703, 0....",,False,train,False,32420,num-06-03-17022-g002.jpg,mri,5412,False,False,magnetic resonance imaging,what shows a larger mass arising from clitoris...,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows a larger mass arisg from clitoris w...,"[[-1.8525879383087158, -1.3275012969970703, 0....",False


In [19]:
data_location

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.h5'

In [20]:
# # df.head(1)
# # len(new_rows)
# new_rows[1].augmentation
# df.columns
# aug_keys = df.augmentation.drop_duplicates().values

# aug_keys
df[['augmentation','idx']].iloc[[0,1,-2,-1]]

Unnamed: 0,augmentation,idx
0,1.0,0
2799,1.0,1
5411,,32419
5412,,32420


In [21]:
import numpy as np
aug_keys = [int(i) if not np.isnan(i) else 0 for i in df.augmentation.drop_duplicates().values]
set(aug_keys)

{0, 1, 2, 3, 4, 5}

In [22]:
 with HDFStore(data_location) as store:
        k = store.keys()
k        

['/augmentation_0',
 '/augmentation_1',
 '/augmentation_2',
 '/augmentation_3',
 '/augmentation_4',
 '/augmentation_5',
 '/data',
 '/index',
 '/light',
 '/test']

In [23]:

from collections import defaultdict
index_dict = defaultdict(lambda:[])

with VerboseTimer(f"Storing {len(aug_keys)} dataframes"):
    with HDFStore(data_location) as store:
        for aug_key in aug_keys:
            with VerboseTimer(f"Storing dataframe '{aug_key}'"):
                data = df[df.augmentation == aug_key]

                store_key = f'augmentation_{aug_key}'
                idxs = data.idx.values                                
                index_dict['idx'].extend(idxs)        
                
                paths = data.path.values                
                index_dict['paths'].extend(paths)                
                
                index_dict['image_path'].extend(paths)
                index_dict['augmentation_key'].extend([aug_key]*len(paths))
                index_dict['store_path'].extend([data_location]*len(paths))
                index_dict['store_key'].extend([store_key]*len(paths))
                store[store_key] = data
                
        index=pd.DataFrame(index_dict) 
        store['index'] = index

Storing dataframe '1': 0:00:04.916307
Storing dataframe '2': 0:00:05.158602
Storing dataframe '3': 0:00:05.146551
Storing dataframe '4': 0:00:05.577693
Storing dataframe '5': 0:00:05.190126
Storing dataframe '0': 0:00:00.024586
Storing 6 dataframes: 0:00:26.112746


### The results:

In [24]:
with HDFStore(data_location) as store:
    loaded_index = store['index']

print(f'image_path: {loaded_index.image_path[0]}')    
print(f'store_path: {loaded_index.store_path[0]}')    
print(f'augmentation_key: {loaded_index.augmentation_key[0]}')    
  
loaded_index.head(1)

image_path: C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\rjv03401.jpg\_0_1478.jpg
store_path: C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5
augmentation_key: 1


Unnamed: 0,augmentation_key,idx,image_path,paths,store_key,store_path
0,1,0,C:\Users\Public\Documents\Data\2018\VQAMed2018...,C:\Users\Public\Documents\Data\2018\VQAMed2018...,augmentation_1,C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-ME...


In [25]:
with HDFStore(data_location) as store:
    print(list(store.keys()))

['/augmentation_0', '/augmentation_1', '/augmentation_2', '/augmentation_3', '/augmentation_4', '/augmentation_5', '/data', '/index', '/light', '/test']


In [26]:
with pd.HDFStore(data_location) as store:
    augmentation_1 = store['augmentation_1']
    augmentation_5 = store['augmentation_5']

In [27]:
v5 = min(augmentation_5.idx),max(augmentation_5.idx)
v1 = min(augmentation_1.idx),max(augmentation_1.idx)

print(v5)
print(v1)
len(augmentation_1)
augmentation_1.head(5).idx


(21617, 27007)
(0, 5412)


0       0
2799    1
2908    2
1       3
1691    4
Name: idx, dtype: int32

In [28]:
augmentation_5.tail(5).idx

5275    27003
5330    27004
5360    27005
5392    27006
5412    27007
Name: idx, dtype: int32