In [1]:
import os
import pandas as pd
from pandas import HDFStore
import IPython
from IPython.display import Image, display
import pyarrow
from tqdm import tqdm
from multiprocessing.pool import ThreadPool as Pool
import logging
import pyarrow as pa
import pyarrow.parquet as pq

In [6]:
from common.utils import VerboseTimer
from common.functions import get_highlighted_function_code, generate_image_augmentations,  get_image
from common.os_utils import File
from common.settings import data_access
import vqa_logger 
logger = logging.getLogger(__name__)

In [9]:
# Load the processed dataset through the project's data-access layer.
df_data = data_access.load_processed_data()

loading from:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.parquet
[2019-02-05 21:32:05][DEBUG] Loading Data: 0:00:10.463825
[2019-02-05 21:32:07][DEBUG] Converting to pandas: 0:00:02.232098


In [8]:
# Restrict the data to the two groups this notebook works with.
wanted_groups = ['train', 'validation']
in_wanted_group = df_data.group.isin(wanted_groups)
df_data = df_data[in_wanted_group]
print(f'Data length: {len(df_data)}')
df_data.head(2)

Data length: 14792


Unnamed: 0,index,image_name,question,answer,path,processed_question,processed_answer,diagnosis,locations,imaging_device,answer_embedding,question_embedding,group
0,0,synpic41148.jpg,what kind of image is this?,cta - ct angiography,C:\Users\Public\Documents\Data\2019\train\Trai...,what kind of image is this?,cta - ct angiography,,,ct,"[-0.946086049079895, 0.675370454788208, 1.3840...","[-2.1590447425842285, 3.4943666458129883, 0.19...",train
1,1,synpic43984.jpg,is this a t1 weighted image?,no,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a t1 weighted image?,no,,,unknown,"[0.029011979699134827, 1.9719411134719849, 1.5...","[1.099464774131775, 0.1577463150024414, -2.948...",train


In [5]:
# Show the first row of each distinct `group` value that is still present.
df_data.group.drop_duplicates()

0             train
12792    validation
Name: group, dtype: category
Categories (2, object): [train, validation]

### For the augmentation we will use the following code:

In [6]:
# Render the source of the augmentation routine (comments kept) inline.
code = get_highlighted_function_code(
    generate_image_augmentations,
    remove_comments=False,
)
display(code)

In [7]:
# Augmentations are generated for the training rows only.
is_train_row = df_data.group == 'train'
df_train = df_data[is_train_row]

# The same image path can appear on several rows, so keep each path once.
image_paths = df_train.path.drop_duplicates()
print(len(image_paths))

def get_file_info(fn):
    """Describe an image path and the folder that holds its augmentations.

    Args:
        fn: Path of the source image.

    Returns:
        A tuple ``(fn, file_name, ext, output_dir)`` where ``file_name`` is the
        base name without its extension, ``ext`` is the extension without the
        leading dot (empty when the name has none) and ``output_dir`` is
        ``<image folder>/augmentations/<full file name>`` followed by a
        trailing path separator.
    """
    image_folder, full_file_name = os.path.split(fn)
    # splitext cuts at the last dot only: names with several dots keep their
    # whole stem, and names without any dot no longer fail to unpack.
    file_name, dotted_ext = os.path.splitext(full_file_name)
    ext = dotted_ext[1:]
    # Joining with '' appends the platform separator rather than a literal '\\'.
    output_dir = os.path.join(image_folder, 'augmentations', full_file_name, '')
    return (fn, file_name, ext, output_dir)

# Resolve the output folder of every image, keep only the images whose folder
# does not exist yet, and number those entries for the progress messages.
images_info = [get_file_info(p) for p in image_paths]
pending_infos = [info for info in images_info if not os.path.isdir(info[3])]
non_existing_paths = [(position, *info) for position, info in enumerate(pending_infos)]

print(f'Generating augmentations for {len(non_existing_paths)} images')


def augments_single_image(tpl_data):
    """Generate the augmentations of one image without ever raising.

    Args:
        tpl_data: Tuple ``(i, curr_image_path, file_name, ext, output_dir)``.

    Returns:
        ``(1, progress_message)`` on success, or ``(0, failure_message)`` when
        anything went wrong. The failure message names the image and the
        exception type so the run summary shows what failed and why.
    """
    # Known before unpacking so the failure message can always mention a path.
    curr_image_path = '<unknown>'
    try:
        (i, curr_image_path, file_name, ext, output_dir) = tpl_data
        msg = (f'Augmenting ({i+1}/{len(non_existing_paths)})\t"{file_name}" -> {output_dir}')
        # Report progress only every 100th image to keep the output short.
        if i % 100 == 0:
            print(msg)
        File.validate_dir_exists(output_dir)
        generate_image_augmentations(curr_image_path, output_dir)
        res = 1
    except Exception as e:
        # Best effort: one bad image must not abort the whole pool run.
        msg = f'Failed augmenting "{curr_image_path}": {type(e).__name__}: {e}'
        res = 0
    return (res, msg)


# Run the augmentations on a pool of worker threads.
# `pool_res` is initialised first so later cells that read it still work when
# the run fails, and the `with` block shuts the pool down on the error path too.
pool_res = []
try:
    inputs = non_existing_paths
    with Pool(processes=8) as pool:
        pool_res = pool.map(augments_single_image, inputs)
except Exception as ex:
    print(f'Error:\n{str(ex)}')

3200
Generating augmentations for 0 images


In [8]:
# Split the worker results into failure and success messages by status flag.
failes = [message for status, message in pool_res if status == 0]
successes = [message for status, message in pool_res if status == 1]

# Show at most five messages per outcome.
f_summary = '\n'.join(failes[:5])
s_summary = '\n'.join(successes[:5])
summary = (
    f'success: {len(successes)}\n{s_summary}\n\nfailes: {len(failes)}\n{f_summary}'
).strip()

print(summary)

success: 0


failes: 0


In [9]:
# Map every source image to the folder holding its augmented copies.
aug_dict = {image_path: output_dir for (image_path, file_name, ext, output_dir) in images_info}

# Augmented rows continue numbering after the last training row's index label.
curr_idx = df_train.tail(1).index[0] + 1

# Working copy of the training rows with the two bookkeeping columns added.
df_augments = df_train.copy()
df_augments['augmentation'] = 0
df_augments['idx'] = 0

print(len(df_augments))
new_rows = []
with VerboseTimer("Collecting augmented rows"):
    pbar = tqdm(aug_dict.items())
    for image_path, output_dir in pbar:
        # The folder listing is identical for every row of this image, so it is
        # read once per image rather than once per row.
        augment_files = [os.path.join(output_dir, fn) for fn in sorted(os.listdir(output_dir))]
        image_rows = df_augments[df_augments.path == image_path]
        for i_row, row in image_rows.iterrows():
            for i_augment, augment_path in enumerate(augment_files):
                # One copy of the row per augmented file, numbered from 1.
                r = row.copy()
                r['path'] = augment_path
                r['augmentation'] = i_augment + 1
                r['idx'] = curr_idx
                curr_idx += 1
                new_rows.append(r)


12792


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3200/3200 [01:14<00:00, 42.90it/s]


[2019-02-04 21:54:08][DEBUG] Collecting augmented rows: 0:01:14.636945


In [10]:
# Turn the collected row copies into a single frame.
with VerboseTimer("Creating rows dataframe"):
    df_augmented_rows = pd.DataFrame(new_rows)

# Stack the original training rows on top of their augmented copies.
# NOTE(review): df_train has no 'augmentation'/'idx' columns, so the original
# rows carry NaN in both after this concat.
frames = [df_train, df_augmented_rows]
df = pd.concat(frames)
print(len(df))

df.head(1)

[2019-02-04 21:54:13][DEBUG] Creating rows dataframe: 0:00:04.531613
76748


Unnamed: 0,answer,answer_embedding,augmentation,diagnosis,group,idx,image_name,imaging_device,index,locations,path,processed_answer,processed_question,question,question_embedding
0,cta - ct angiography,"[-0.946086049079895, 0.675370454788208, 1.3840...",,,train,,synpic41148.jpg,ct,0,,C:\Users\Public\Documents\Data\2019\train\Trai...,cta - ct angiography,what kind of image is this?,what kind of image is this?,"[-2.1590447425842285, 3.4943666458129883, 0.19..."


## Giving a meaningful index across dataframes:

In [11]:
# Order the rows by augmentation number first, then by idx, both ascending.
df = df.sort_values(['augmentation', 'idx'], ascending=[True, True])


In [12]:

# Overwrite idx with a dense 0..len(df)-1 counter that follows the current row order.
len_df = len(df)
idxs = range(len_df)
len_idx = len(set(idxs))
assert len_idx == len_df, (
    f'length of indexes ({len_idx}) did not match length of dataframe ({len_df})'
)
df['idx'] = idxs

In [13]:
# Peek at the first two and last two rows after idx was reassigned.
df.iloc[[0,1,-2,-1]]

Unnamed: 0,answer,answer_embedding,augmentation,diagnosis,group,idx,image_name,imaging_device,index,locations,path,processed_answer,processed_question,question,question_embedding
0,cta - ct angiography,"[-0.946086049079895, 0.675370454788208, 1.3840...",1.0,,train,0,synpic41148.jpg,ct,0,,C:\Users\Public\Documents\Data\2019\train\Trai...,cta - ct angiography,what kind of image is this?,what kind of image is this?,"[-2.1590447425842285, 3.4943666458129883, 0.19..."
3200,axial,"[-1.3220698833465576, -0.9305600523948669, 0.8...",1.0,,train,1,synpic41148.jpg,ct,3200,,C:\Users\Public\Documents\Data\2019\train\Trai...,axial,which plane is this image taken?,which plane is this image taken?,"[-2.426004648208618, 4.558772087097168, 0.1516..."
12790,yes,"[-2.3747644424438477, 1.0431363582611084, 1.62...",,,train,76746,synpic48036.jpg,ct,12790,,C:\Users\Public\Documents\Data\2019\train\Trai...,yes,is ct normal?,is the ct scan normal?,"[0.7937809228897095, 0.42329514026641846, -3.6..."
12791,no,"[0.029011979699134827, 1.9719411134719849, 1.5...",,,train,76747,synpic54897.jpg,mr,12791,,C:\Users\Public\Documents\Data\2019\train\Trai...,no,is there an abnormality in the mr?,is there an abnormality in the mri?,"[1.5513315200805664, -0.19218707084655762, -1...."


In [14]:
# NOTE(review): `data_location` is not assigned in any cell visible here; it
# presumably survives from earlier kernel state — confirm where it is defined,
# otherwise this cell fails after a kernel restart.
data_location

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.parquet'

In [15]:
# Spot-check the augmentation / idx columns on the first two and last two rows.
df[['augmentation','idx']].iloc[[0,1,-2,-1]]

Unnamed: 0,augmentation,idx
0,1.0,0
3200,1.0,1
12790,,76746
12791,,76747


In [16]:
import numpy as np

# Distinct augmentation numbers as ints; a missing value (NaN) is mapped to key 0.
unique_augmentations = df.augmentation.drop_duplicates().values
aug_keys = [0 if np.isnan(value) else int(value) for value in unique_augmentations]
set(aug_keys)

{0, 1, 2, 3, 4, 5}

In [17]:
# Keep the augmentation store in the same folder as the processed-data file
# instead of hard-coding a user-specific absolute path.
# `data_location` must already be defined, exactly as this cell required before.
augmentation_index = os.path.join(os.path.dirname(data_location), 'augmentation_index.h5')
augmentation_index

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\augmentation_index.h5'

In [18]:
# Show the first five rows of the combined, re-indexed frame.
df.head()

Unnamed: 0,answer,answer_embedding,augmentation,diagnosis,group,idx,image_name,imaging_device,index,locations,path,processed_answer,processed_question,question,question_embedding
0,cta - ct angiography,"[-0.946086049079895, 0.675370454788208, 1.3840...",1.0,,train,0,synpic41148.jpg,ct,0,,C:\Users\Public\Documents\Data\2019\train\Trai...,cta - ct angiography,what kind of image is this?,what kind of image is this?,"[-2.1590447425842285, 3.4943666458129883, 0.19..."
3200,axial,"[-1.3220698833465576, -0.9305600523948669, 0.8...",1.0,,train,1,synpic41148.jpg,ct,3200,,C:\Users\Public\Documents\Data\2019\train\Trai...,axial,which plane is this image taken?,which plane is this image taken?,"[-2.426004648208618, 4.558772087097168, 0.1516..."
6400,"lung, mediastinum, pleura","[-0.2881927192211151, 1.420225977897644, 2.803...",1.0,lung,train,2,synpic41148.jpg,ct,6400,"lung, mediastinum, pleura",C:\Users\Public\Documents\Data\2019\train\Trai...,"lung, mediastinum, pleura",which organ is captured by this ct?,which organ is captured by this ct scan?,"[-2.0855326652526855, 4.293310642242432, 0.579..."
9600,cryptococcal pneumonia in an immunocompetent host,"[-1.6821398735046387, 0.3354760706424713, -1.6...",1.0,,train,3,synpic41148.jpg,ct,9600,,C:\Users\Public\Documents\Data\2019\train\Trai...,cryptococcal pneumonia in an immunocompetent host,what is abnormal ct?,what is abnormal in the ct scan?,"[-2.741878032684326, 1.224818229675293, -0.289..."
1,no,"[0.029011979699134827, 1.9719411134719849, 1.5...",1.0,,train,4,synpic43984.jpg,unknown,1,,C:\Users\Public\Documents\Data\2019\train\Trai...,no,is this a t1 weighted image?,is this a t1 weighted image?,"[1.099464774131775, 0.1577463150024414, -2.948..."


# TODO: save as parquet. 
#### partition by augmentation

In [19]:

from collections import defaultdict

# Column name -> list of values; turned into the lookup table stored under 'index'.
index_dict = defaultdict(list)

# Rows with no augmentation number (NaN) belong to key 0, matching how aug_keys
# maps NaN to 0. A plain `== 0` comparison would never select NaN rows, which
# would leave the key-0 frame empty.
augmentation_of_row = df.augmentation.fillna(0)

# NOTE(review): the recorded run of this cell ended with
# "OverflowError: Python int too large to convert to C long"; the cause is not
# established here — confirm before relying on the stored file.
with VerboseTimer(f"Storing {len(aug_keys)} dataframes"):
    with HDFStore(augmentation_index) as store:
        pbar = tqdm(aug_keys)
        for aug_key in pbar:
            with VerboseTimer(f"Storing dataframe '{aug_key}'"):
                data = df[augmentation_of_row == aug_key]

                store_key = f'augmentation_{aug_key}'
                idxs = data.idx.values
                index_dict['idx'].extend(idxs)

                paths = data.path.values
                index_dict['paths'].extend(paths)

                index_dict['image_path'].extend(paths)
                index_dict['augmentation_key'].extend([aug_key]*len(paths))
                index_dict['store_path'].extend([augmentation_index]*len(paths))
                index_dict['store_key'].extend([store_key]*len(paths))
                # One stored frame per augmentation key.
                store[store_key] = data

        index = pd.DataFrame(index_dict)
        store['index'] = index

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['answer', 'answer_embedding', 'diagnosis', 'group', 'image_name', 'imaging_device', 'locations', 'path', 'processed_answer', 'processed_question', 'question', 'question_embedding']]

  exec(code_obj, self.user_global_ns, self.user_ns)


[2019-02-04 21:54:35][DEBUG] Storing dataframe '1': 0:00:22.259087
[2019-02-04 21:54:35][DEBUG] Storing 6 dataframes: 0:00:22.447768


OverflowError: Python int too large to convert to C long

### The results:

In [None]:
# Read the lookup table back from the store.
with HDFStore(augmentation_index) as store:
    loaded_index = store['index']

# Print the location fields of the entry labelled 0.
for field in ('image_path', 'store_path', 'augmentation_key'):
    print(f'{field}: {loaded_index[field][0]}')

loaded_index.head(1)

In [None]:
# List every key that was written to the store.
with HDFStore(augmentation_index) as store:
    stored_keys = list(store.keys())
    print(stored_keys)

In [None]:
# Load the frames stored for augmentation keys 1 and 5.
with pd.HDFStore(augmentation_index) as store:
    augmentation_1, augmentation_5 = [
        store[key] for key in ('augmentation_1', 'augmentation_5')
    ]

In [None]:
# Smallest and largest idx of each loaded frame, as (min, max) pairs.
idx_1 = augmentation_1.idx
idx_5 = augmentation_5.idx
v1 = (min(idx_1), max(idx_1))
v5 = (min(idx_5), max(idx_5))

print(v5)
print(v1)
len(augmentation_1)
augmentation_1.head(5).idx


In [None]:
# Show the idx values of the last five rows of the augmentation_5 frame.
augmentation_5.tail(5).idx