In [1]:
import os
import pandas as pd
from pandas import HDFStore
import IPython
from IPython.display import Image, display
import pyarrow

In [2]:
from common.constatns import data_location, vqa_specs_location, fn_meta, augmented_data_location
from common.utils import VerboseTimer
from common.functions import get_highlited_function_code, generate_image_augmentations,  get_image
from common.os_utils import File


In [3]:
print(f'loading from:\n{data_location}')
with VerboseTimer("Loading Data"):
    with HDFStore(data_location) as store:
         df_data = store['data']
        
df_data = df_data[df_data.group.isin(['train','validation'])]
print(f'Data length: {len(df_data)}')        
df_data.head(2)

loading from:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5
Loading Data: 0:00:06.273723
Data length: 5913


Unnamed: 0,image_name,question,answer,group,path,tumor,hematoma,brain,abdomen,neck,liver,imaging_device,answer_embedding,question_embedding
0,rjv03401.jpg,what does MRI show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,True,False,False,False,False,False,mri,"[[3.8335671424865723, 0.9851416349411011, 0.60...","[[-2.1287951469421387, 2.4069643020629883, 0.9..."
1,AIAN-14-313-g002.jpg,where does axial seCTion MRI abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,True,False,False,mri,"[[0.9880439043045044, 0.907943844795227, -1.30...","[[0.329662561416626, 1.4127026796340942, -3.38..."


### For the augmaentation we will use the following code:

In [4]:
code = get_highlited_function_code(generate_image_augmentations,remove_comments=False)
IPython.display.display(code)  

In [5]:
df_train = df_data[df_data.group == 'train']

image_paths = df_train.path.drop_duplicates()
print(len(image_paths))




def get_file_info(fn):
        image_folder, full_file_name = os.path.split(fn)
        file_name, ext = full_file_name.split('.')[-2:]        
        output_dir = os.path.join(image_folder,'augmentations',full_file_name+'\\')
        return (fn, file_name, ext, output_dir)
        
images_info = [get_file_info(p) for p in image_paths]        
non_existing_paths = [(fn, file_name, ext, output_dir) for (fn, file_name, ext, output_dir) in images_info if not os.path.isdir(output_dir)]



print(f'Generating augmentations for {len(non_existing_paths)} images')

non_existing_paths = non_existing_paths
for i,  (curr_image_path, file_name, ext, output_dir) in enumerate(non_existing_paths):
    print(f'Augmenting ({i+1}/{len(non_existing_paths)})\t"{file_name}" -> {output_dir}')    
    File.validate_dir_exists(output_dir)
    generate_image_augmentations(curr_image_path, output_dir)

2278
Generating augmentations for 0 images


In [6]:
a = images_info[:1]
a = images_info
aug_dict = {image_path:output_dir for (image_path, file_name, ext, output_dir) in a}

curr_idx = df_train.tail(1).index[0] +1

df_augments = df_train.copy()
df_augments['augmentation'] = 0
df_augments['idx'] = 0

print(len(df_augments))
new_rows = []
with VerboseTimer("Collecting augmented rows"):
    for image_path, output_dir in aug_dict.items():
        #print(image_path)
        image_rows = df_augments[df_augments.path == image_path]
        for i_row, row in image_rows.iterrows():
            #print(i_row)
            augment_files = [os.path.join(output_dir, fn) for fn in sorted(os.listdir(output_dir))]

            for i_augment, augment_path in enumerate(augment_files):
                r = row.copy()
                r.path = augment_path            
#                 r.image = get_image(augment_path)
                r.augmentation = i_augment + 1 
                r.idx = curr_idx
                curr_idx+=1
                r.reset_index()
                new_rows.append(r)        


5413
Collecting augmented rows: 0:01:42.233703


In [7]:
with VerboseTimer("Creating rows dataframe"):
    df_augmented_rows = pd.DataFrame(new_rows)
    
df = pd.concat([df_train, df_augmented_rows])    
print(len(df))

df.head(0)

Creating rows dataframe: 0:00:07.796528
113566


Unnamed: 0,abdomen,answer,answer_embedding,augmentation,brain,group,hematoma,idx,image_name,imaging_device,liver,neck,path,question,question_embedding,tumor


## Giving a meaningful index across dataframes:

In [8]:
df = df.sort_values(['augmentation', 'idx'], ascending=[True, True])


In [9]:

len_df = len(df)
idxs = range(0, len_df)
len_idx = len(set(idxs))
assert  len_idx== len_df , f'length of indexes ({len_idx}) did not match length of dataframe ({len_df})'
df.idx = idxs

In [10]:
df.iloc[[0,1,-2,-1]]

Unnamed: 0,abdomen,answer,answer_embedding,augmentation,brain,group,hematoma,idx,image_name,imaging_device,liver,neck,path,question,question_embedding,tumor
0,False,tumor at tail pancreas,"[[3.8335671424865723, 0.9851416349411011, 0.60...",1.0,False,train,False,0,rjv03401.jpg,mri,False,False,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does MRI show?,"[[-2.1287951469421387, 2.4069643020629883, 0.9...",True
2799,False,MRI,"[[-0.14483636617660522, 1.9622962474822998, 2....",1.0,False,train,False,1,rjv03401.jpg,mri,False,False,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows tumor at tail pancreas?,"[[-2.2932257652282715, 1.839470386505127, 0.53...",True
5411,False,cortical bone right mibular ramus,"[[-0.5690958499908447, 0.053942739963531494, -...",,False,train,False,113564,cro-0007-0732-g03.jpg,ct,False,False,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does sagittal CT show erosion ?,"[[-1.536467432975769, 2.7954959869384766, 0.49...",False
5412,False,MRI,"[[-0.14483636617660522, 1.9622962474822998, 2....",,False,train,False,113565,num-06-03-17022-g002.jpg,mri,False,False,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows a larger mass arisg from clitoris w...,"[[-2.839320182800293, 2.1729674339294434, 0.58...",False


In [11]:
data_location

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.h5'

In [12]:
# # df.head(1)
# # len(new_rows)
# new_rows[1].augmentation
# df.columns
# aug_keys = df.augmentation.drop_duplicates().values

# aug_keys
df[['augmentation','idx']].iloc[[0,1,-2,-1]]

Unnamed: 0,augmentation,idx
0,1.0,0
2799,1.0,1
5411,,113564
5412,,113565


In [13]:
import numpy as np
aug_keys = [int(i) if not np.isnan(i) else 0 for i in df.augmentation.drop_duplicates().values]
set(aug_keys)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

In [14]:
 with HDFStore(data_location) as store:
        k = store.keys()
k        

['/data', '/light', '/test']

In [15]:

from collections import defaultdict
index_dict = defaultdict(lambda:[])

with VerboseTimer(f"Storing {len(aug_keys)} dataframes"):
    with HDFStore(data_location) as store:
        for aug_key in aug_keys:
            with VerboseTimer(f"Storing dataframe '{aug_key}'"):
                data = df[df.augmentation == aug_key]

                store_key = f'augmentation_{aug_key}'
                idxs = data.idx.values                                
                index_dict['idx'].extend(idxs)        
                
                paths = data.path.values                
                index_dict['paths'].extend(paths)                
                
                index_dict['image_path'].extend(paths)
                index_dict['augmentation_key'].extend([aug_key]*len(paths))
                index_dict['store_path'].extend([data_location]*len(paths))
                index_dict['store_key'].extend([store_key]*len(paths))
                store[store_key] = data
                
        index=pd.DataFrame(index_dict) 
        store['index'] = index

Storing dataframe '1': 0:00:05.652354
Storing dataframe '2': 0:00:05.465061
Storing dataframe '3': 0:00:05.491244
Storing dataframe '4': 0:00:06.339285
Storing dataframe '5': 0:00:05.533040
Storing dataframe '6': 0:00:05.776421
Storing dataframe '7': 0:00:05.397454
Storing dataframe '8': 0:00:05.560201
Storing dataframe '9': 0:00:05.368780
Storing dataframe '10': 0:00:06.342468
Storing dataframe '11': 0:00:05.675203
Storing dataframe '12': 0:00:05.871817
Storing dataframe '13': 0:00:06.866912
Storing dataframe '14': 0:00:06.860370
Storing dataframe '15': 0:00:06.171339
Storing dataframe '16': 0:00:06.058223
Storing dataframe '17': 0:00:06.472753
Storing dataframe '18': 0:00:05.727371
Storing dataframe '19': 0:00:05.853733
Storing dataframe '20': 0:00:05.626999
Storing dataframe '0': 0:00:00.014829
Storing 21 dataframes: 0:01:58.410680


### The results:

In [16]:
with HDFStore(data_location) as store:
    loaded_index = store['index']

print(f'image_path: {loaded_index.image_path[0]}')    
print(f'store_path: {loaded_index.store_path[0]}')    
print(f'augmentation_key: {loaded_index.augmentation_key[0]}')    
  
loaded_index.head(1)

image_path: C:\Users\Public\Documents\Data\2018\VQAMed2018Train\VQAMed2018Train-images\augmentations\rjv03401.jpg\_0_1121.jpg
store_path: C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5
augmentation_key: 1


Unnamed: 0,augmentation_key,idx,image_path,paths,store_key,store_path
0,1,0,C:\Users\Public\Documents\Data\2018\VQAMed2018...,C:\Users\Public\Documents\Data\2018\VQAMed2018...,augmentation_1,C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-ME...


In [17]:
with HDFStore(data_location) as store:
    print(list(store.keys()))

['/augmentation_0', '/augmentation_1', '/augmentation_10', '/augmentation_11', '/augmentation_12', '/augmentation_13', '/augmentation_14', '/augmentation_15', '/augmentation_16', '/augmentation_17', '/augmentation_18', '/augmentation_19', '/augmentation_2', '/augmentation_20', '/augmentation_3', '/augmentation_4', '/augmentation_5', '/augmentation_6', '/augmentation_7', '/augmentation_8', '/augmentation_9', '/data', '/index', '/light', '/test']


In [18]:
with pd.HDFStore(data_location) as store:
    augmentation_1 = store['augmentation_1']
    augmentation_20 = store['augmentation_20']

Unnamed: 0,abdomen,answer,answer_embedding,augmentation,brain,group,hematoma,idx,image_name,imaging_device,liver,neck,path,question,question_embedding,tumor
5412,False,MRI,"[[-0.14483636617660522, 1.9622962474822998, 2....",1.0,False,train,False,5412,num-06-03-17022-g002.jpg,mri,False,False,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows a larger mass arisg from clitoris w...,"[[-2.839320182800293, 2.1729674339294434, 0.58...",False


In [19]:
v20 = min(augmentation_20.idx),max(augmentation_20.idx)
v1 = min(augmentation_1.idx),max(augmentation_1.idx)

print(v20)
print(v1)
len(augmentation_1)
augmentation_1.head(5).idx


(102845, 108152)
(0, 5412)


0       0
2799    1
2908    2
1       3
1691    4
Name: idx, dtype: int32

In [21]:
augmentation_20.tail(5).idx

5275    108148
5330    108149
5360    108150
5392    108151
5412    108152
Name: idx, dtype: int32