In [1]:
# %%capture
import logging
import os
import shutil
from pathlib import Path

import IPython
import pyarrow as pa
import pyarrow.parquet as pq
from pandas import HDFStore

In [2]:
import vqa_logger 
# Module-level logger; the cells below use it for their debug messages.
logger = logging.getLogger(__name__)

In [3]:
from common.constatns import raw_data_location, data_location
from common.settings import get_nlp
from common.functions import get_highlighted_function_code, get_image,  get_size
from pre_processing.prepare_data import get_text_features, pre_process_raw_data
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [4]:
# Obtain the NLP engine from common.settings (the recorded run logged en_core_web_lg).
nlp = get_nlp()

[32m[2019-02-04 21:47:01][DEBUG][0m [32musing embedding vector: en_core_web_lg[0m
[32m[2019-02-04 21:47:02][DEBUG][0m [32mGot NLP engine (en_core_web_lg)[0m


#### Where get_nlp is defined as:

In [None]:
# Display get_nlp's code as rich output; remove_comments=True presumably strips
# its comments from the rendering — confirm against common.functions.
code = get_highlighted_function_code(get_nlp,remove_comments=True)
IPython.display.display(code)

In [None]:
# Echo the path of the raw-data HDF5 file that is opened in the following cell.
raw_data_location

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\raw_data.h5'

In [None]:
# Load the raw question/answer table stored under the 'data' key.
# mode='r': HDFStore opens files in append mode ('a') by default, which needs
# write access and creates an empty file when the path is wrong. This cell only
# reads, so open the store read-only and fail loudly on a bad path instead.
with HDFStore(raw_data_location, mode='r') as store:
    image_name_question = store['data']

In [None]:
# Preview the first rows of the loaded table.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path
0,synpic41148,what kind of image is this?,cta - ct angiography,train,C:\Users\Public\Documents\Data\2019\train\Trai...
1,synpic43984,is this a t1 weighted image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
2,synpic38930,what type of imaging modality is used to acqui...,us - ultrasound,train,C:\Users\Public\Documents\Data\2019\train\Trai...
3,synpic52143,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
4,synpic20934,what type of image modality is this?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...


##### This is just for performance and quick debug cycles! Remove before actual training:

In [None]:
# image_name_question = image_name_question.head(5)
# image_name_question_val = image_name_question_val.head(5)

### Additional functions we will use:

#### get_text_features:

In [None]:
# Display the code of get_text_features (imported from pre_processing.prepare_data).
code = get_highlighted_function_code(get_text_features,remove_comments=True)
IPython.display.display(code)

#### get_image:

In [None]:
# Display the code of get_image (imported from common.functions).
code = get_highlighted_function_code(get_image,remove_comments=True)
IPython.display.display(code)

#### pre_process_raw_data:

In [None]:
# Display the code of pre_process_raw_data (imported from pre_processing.prepare_data).
code = get_highlighted_function_code(pre_process_raw_data,remove_comments=True)
IPython.display.display(code)


### Clean and enrich the data

In [None]:
from pre_processing.data_enrichment import enrich_data
from pre_processing.data_cleaning import clean_data

# Keep an untouched copy of the frame as loaded, because the name
# image_name_question is rebound to the processed result right below.
orig_image_name_question = image_name_question.copy()

# Cleaning runs first; enrichment is applied to the cleaned frame.
image_name_question = enrich_data(clean_data(image_name_question))

Looking for word: arch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:03<00:00, 25.64it/s]
Looking for word: breast: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 33.64it/s]
Looking for word: mammograph: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 24.79it/s]
  0%|                                                                                                                                                                                                                | 0/3700 [00:00<?, ?it/s]

[32m[2019-02-04 21:47:16][INFO][0m consolidating image devices


image device:	mr         : 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3700/3700 [00:53<00:00, 69.28it/s]




In [None]:
# Summary statistics per value of the 'group' column.
groups = image_name_question.groupby('group')
# Only the last expression of a cell is rendered automatically, so the
# per-group summary used to be computed and silently discarded. Display it
# explicitly so both tables show up.
IPython.display.display(groups.describe())

# image_name statistics (count / unique / top / freq) per imaging device.
image_name_question[['imaging_device','image_name']].groupby('imaging_device').describe()

Unnamed: 0_level_0,image_name,image_name,image_name,image_name
Unnamed: 0_level_1,count,unique,top,freq
imaging_device,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
angiogram,396,99,synpic59107,4
ct,4283,1071,synpic41341,4
mammograph,128,32,synpic20049,4
mr,5649,1413,synpic45877,4
unknown,1311,328,synpic43955,4
us,862,216,synpic24249,4
xr,2163,541,synpic50593,4


### Do the actual pre processing

#### If running in an exported notebook, use the following:
(Indent everything below so that it sits under the main guard; this avoids recursive spawning of processes.)

In [None]:
# Only matters when the notebook is exported and run as a script: the rest of
# the code is meant to be indented under this guard.
from multiprocessing import freeze_support
if __name__ == '__main__':
    print('in main')
    # NOTE(review): presumably needed for multiprocessing on Windows / frozen
    # executables — confirm it is still required.
    freeze_support()

in main


Note:  
This might take a while...

In [None]:
logger.debug('----===== Preproceccing train data =====----')
# Long-running step: in the recorded run the answer embedding alone took ~1m14s.
with VerboseTimer("Pre processing training data"):
    # Input is the cleaned and enriched frame produced earlier in the notebook.
    image_name_question_processed = pre_process_raw_data(image_name_question)

[32m[2019-02-04 21:48:09][DEBUG][0m [32m----===== Preproceccing train data =====----[0m
[32m[2019-02-04 21:48:10][INFO][0m Getting answers embedding
[32m[2019-02-04 21:49:24][DEBUG][0m [32mAnswer Embedding: 0:01:14.546679[0m
[32m[2019-02-04 21:49:24][INFO][0m Getting questions embedding


In [None]:
# Preview the first rows of the pre-processed frame.
image_name_question_processed.head()

In [None]:
# Spot-check the rows of a single image. The image_name column holds bare ids
# without a file extension (e.g. 'synpic52143'), so the previous comparison
# against 'synpic52143.jpg' could never match and always showed an empty frame.
image_name_question[image_name_question.image_name == 'synpic52143'].head()

#### Saving the data, so later on we don't need to compute it again

### TODO: add question classification that takes the 2019 data into consideration

In [None]:
def add_dataframe_to_data_set(df, location, partition_cols=('group',)):
    """Write ``df`` into the parquet dataset rooted at ``location``.

    The frame is converted to a pyarrow Table and passed to
    ``pyarrow.parquet.write_to_dataset``.

    Args:
        df: pandas DataFrame to persist. It must contain every column named
            in ``partition_cols``.
        location: Root directory of the dataset (``str`` or ``Path``).
        partition_cols: Columns to partition the dataset by. Defaults to
            ``('group',)``, which is the partitioning this function always
            used. Pass ``None`` (or an empty sequence) to write without
            partitioning.

    Note:
        Nothing under ``location`` is removed here. A caller that needs a
        fresh dataset has to clear the directory before calling.
    """
    table = pa.Table.from_pandas(df)

    pq.write_to_dataset(
        table,
        root_path=str(location),
        # Copy into a list so tuples and other sequences are accepted as well.
        partition_cols=list(partition_cols) if partition_cols else None,
    )
#train_df.to_parquet(fname='',engine='pyarrow',partition_cols=)

In [None]:
logger.debug("Saving the data")
item_to_save = image_name_question_processed
# item_to_save = image_name_question.head(10)

# Remove any previous output so the dataset is rebuilt from scratch.
# data_location ends up being a *directory* (write_to_dataset creates a
# partitioned dataset there), and os.remove() cannot delete directories. The
# old try/except swallowed that OSError, so stale part files survived and each
# re-run wrote a new set of files next to them.
root = Path(data_location)
if root.is_dir():
    shutil.rmtree(root)
elif root.exists():
    root.unlink()

# Convenience views of the processed frame. This cell does not write them; they
# are kept as notebook globals in case later cells rely on them.
train_df = image_name_question_processed[image_name_question_processed.group.isin(['train', 'validation'])]
test_df = image_name_question_processed[image_name_question_processed.group == 'test']
light = image_name_question_processed[['image_name', 'question', 'answer', 'group', 'path', 'imaging_device']]

with VerboseTimer("Saving model training data"):
    # A single dataset holding every group; the partitioning is done by the helper.
    add_dataframe_to_data_set(image_name_question_processed, root)

size = get_size(data_location)
logger.debug(f"training data's file size was: {size}")



In [None]:
print('Data saved at:')
# Bare last expression, so the location is shown as the cell's output value.
f'{data_location}'