In [1]:
# %%capture
import IPython
import os
import numpy as np
import pandas as pd
from pandas import HDFStore
import spacy
from keras.utils import to_categorical
import cv2
from collections import defaultdict

from vqa_logger import logger
from common.os_utils import File

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from common.constatns import train_data, validation_data, data_location, raw_data_location
from common.settings import input_length, embedding_dim, image_size, seq_length, get_nlp
from common.functions import get_highlited_function_code, get_image, get_text_features, pre_process_raw_data, get_size
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [3]:
# Obtain the NLP engine from the project's `get_nlp` helper; the log lines
# emitted by this cell report which embedding model was loaded.
nlp = get_nlp()

[2019-01-03 23:03:17,675][DEBUG] using embedding vector: en_core_web_lg
[2019-01-03 23:03:18,407][DEBUG] Got NLP engine (en_core_web_lg)


#### Where get_nlp is defined as:

In [4]:
# Show how `get_nlp` is implemented: fetch its code through the project helper
# (with `remove_comments=True`) and render the result as this cell's output.
code = get_highlited_function_code(get_nlp,remove_comments=True)
IPython.display.display(code)

In [5]:
# Load the raw table stored under the 'data' key.
# The store is opened read-only: HDFStore's default mode is 'a' (append), which
# opens the file for writing and silently creates an empty file when the path
# does not exist. This cell only reads, so mode='r' is the safe choice and a
# wrong path now fails loudly instead of leaving a stray file behind.
with HDFStore(raw_data_location, mode='r') as store:
    image_name_question = store['data']

In [6]:
# Preview the first rows of the raw table exactly as it was loaded from disk.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path
0,rjv03401,what does mri show?,lesion at tail of pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
1,AIAN-14-313-g002,where does axial section mri abdomen show hypo...,in distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
2,wjem-11-76f3,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
3,ccr30002-0045-f3,what was normal?,blood supply to the brain,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
4,rjt01904,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...


##### This is just for performance and quick debug cycles! Remove before actual training:

In [7]:
# image_name_question = image_name_question.head(5)
# image_name_question_val = image_name_question_val.head(5)

### Additional functions we will use:

#### get_text_features:

In [8]:
# Render the implementation of `get_text_features` (requested with
# `remove_comments=True`) as this cell's output.
code = get_highlited_function_code(get_text_features,remove_comments=True)
IPython.display.display(code)

#### get_image:

In [9]:
# Render the implementation of `get_image` (requested with
# `remove_comments=True`) as this cell's output.
code = get_highlited_function_code(get_image,remove_comments=True)
IPython.display.display(code)

#### pre_process_raw_data:

In [10]:
# Render the implementation of `pre_process_raw_data` (requested with
# `remove_comments=True`) as this cell's output.
code = get_highlited_function_code(pre_process_raw_data,remove_comments=True)
IPython.display.display(code)


### Clean and enrich the data

In [11]:
from common.functions import enrich_data, clean_data
# Keep a copy of the frame as loaded, before the name is rebound below.
orig_image_name_question = image_name_question.copy()
# NOTE(review): `image_name_question` is both the input and the output of the
# next two lines, so this cell is not idempotent: re-running it feeds already
# processed data back in and overwrites the copy above with it. Re-run the
# loading cell first when repeating this step.
image_name_question = clean_data(image_name_question)
image_name_question = enrich_data(image_name_question)

  0%|                                                                                         | 0/2866 [00:00<?, ?it/s]

[2019-01-03 23:03:19,772][INFO] consolidating image devices


image device:	mri: 100%|██████████████████████████████████████████████████████████| 2866/2866 [00:24<00:00, 116.03it/s]


In [12]:
# NOTE(review): the result of this lookup is discarded, because a cell renders
# only its last expression. The literal also carries a '.jpg' suffix while the
# image names displayed at this stage have no extension — TODO confirm the
# filter can match, and wrap it in display(...) if it should be shown.
image_name_question[image_name_question.image_name == '0392-100X-33-350-g002.jpg'].head()
# Rendered output: first rows of the cleaned and enriched frame.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path,original_question,original_answer,tumor,hematoma,brain,abdomen,neck,liver,imaging_device
0,rjv03401,what does mri show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,lesion at tail of pancreas,True,False,False,False,False,False,mri
1,AIAN-14-313-g002,where does axial section mri abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,where does axial section mri abdomen show hypo...,in distal pancreas,False,False,False,True,False,False,mri
2,wjem-11-76f3,what do arrows denote noncontrast ct pelvis?,complex fluid collection with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,False,True,False,False,False,False,ct
3,ccr30002-0045-f3,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what was normal?,blood supply to the brain,False,False,False,False,False,False,ct
4,rjt01904,what shows evidence a contaed rupture?,repeat ct abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,False,False,False,True,False,False,ct


In [13]:
# NOTE(review): this per-group summary is computed but never shown, because a
# cell renders only its last expression. Wrap it in display(...) if wanted.
image_name_question.groupby('group').describe()
# Rendered output: image-name statistics (count / unique / top / freq) for
# each imaging device.
image_name_question[['imaging_device','image_name']].groupby('imaging_device').describe()

Unnamed: 0_level_0,image_name,image_name,image_name,image_name
Unnamed: 0_level_1,count,unique,top,freq
imaging_device,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ct,3721,1686,1477-7819-6-60-1,7
mra,12,4,SNI-4-153-g003,3
mri,2153,933,JPN-6-94-g002,6
unknown,527,243,TOORTHJ-9-367_F2,6


### Do the actual pre processing

#### If running in an exported notebook, use the following:
(Indent everything below so that it sits under the main guard; this avoids recursive spawning of processes.)

In [14]:
from multiprocessing import freeze_support
# Main guard: the branch below runs only when this module is the entry point,
# not when it is re-imported by a spawned child process.
if __name__ == '__main__':
    print('in main')
    # Standard-library hook for frozen Windows executables that use
    # multiprocessing; it is meant to be called right after the main guard.
    freeze_support()

in main


Note:  
This might take a while...

In [15]:
logger.debug('----===== Preproceccing train data =====----')
# NOTE(review): `image_locations` is not referenced anywhere else in the
# visible notebook — confirm whether it is still needed.
image_locations = train_data.images_path
# The timer reports the elapsed time of the wrapped block under this label
# (see the last line of the cell output).
with VerboseTimer("Pre processing training data"):
    image_name_question_processed = pre_process_raw_data(image_name_question)

[2019-01-03 23:03:44,667][DEBUG] ----===== Preproceccing train data =====----
[2019-01-03 23:03:44,813][DEBUG] Getting answers embedding
Answer Embedding: 0:00:33.517518
[2019-01-03 23:04:18,323][DEBUG] Getting questions embedding
Question Embedding: 0:00:32.964101
[2019-01-03 23:04:51,295][DEBUG] Done
Pre processing training data: 0:01:06.634653


In [16]:
# Preview the processed frame; the output shows the `answer_embedding` and
# `question_embedding` columns added by the pre-processing step.
image_name_question_processed.head()

Unnamed: 0,index,image_name,question,answer,group,path,original_question,original_answer,tumor,hematoma,brain,abdomen,neck,liver,imaging_device,answer_embedding,question_embedding
0,0,rjv03401.jpg,what does mri show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,lesion at tail of pancreas,True,False,False,False,False,False,mri,"[[3.8335671424865723, 0.9851416349411011, 0.60...","[[3.8335671424865723, 0.9851416349411011, 0.60..."
1,1,AIAN-14-313-g002.jpg,where does axial section mri abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,where does axial section mri abdomen show hypo...,in distal pancreas,False,False,False,True,False,False,mri,"[[0.9880439043045044, 0.907943844795227, -1.30...","[[0.9880439043045044, 0.907943844795227, -1.30..."
2,2,wjem-11-76f3.jpg,what do arrows denote noncontrast ct pelvis?,complex fluid collection with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,False,True,False,False,False,False,ct,"[[0.16135236620903015, -1.7424618005752563, -1...","[[0.16135236620903015, -1.7424618005752563, -1..."
3,3,ccr30002-0045-f3.jpg,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what was normal?,blood supply to the brain,False,False,False,False,False,False,ct,"[[1.421677589416504, 1.1374449729919434, 0.465...","[[1.421677589416504, 1.1374449729919434, 0.465..."
4,4,rjt01904.jpg,what shows evidence a contaed rupture?,repeat ct abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,False,False,False,True,False,False,ct,"[[3.0298147201538086, -0.026108086109161377, 1...","[[3.0298147201538086, -0.026108086109161377, 1..."


#### Saving the data, so later on we don't need to compute it again

In [17]:
image_name_question_processed.imaging_device.drop_duplicates()
image_name_question_processed[image_name_question_processed.imaging_device=='ct mri'].image_name.drop_duplicates()
image_name_question_processed[image_name_question_processed.image_name == 'JPN-9-48-g001.jpg'].imaging_device

548     unknown
1113    unknown
3777    unknown
Name: imaging_device, dtype: object

In [18]:
logger.debug("Saving the data")
# Single switch for what gets persisted. Everything written below is derived
# from `item_to_save`, so swapping in the commented line really does shrink
# the saved data for quick debug runs (previously this variable was ignored).
item_to_save = image_name_question_processed
# item_to_save = image_name_question.head(10)

# Start from a clean output file. Removal stays best-effort: a missing file is
# the normal first-run case, and any other OS error is logged rather than
# silently swallowed (the write below surfaces a truly unusable path).
try:
    os.remove(data_location)
except FileNotFoundError:
    pass
except OSError as remove_error:
    logger.debug(f"Could not remove existing data file: {remove_error}")

# Train and validation rows are stored together under one key; test rows get
# their own key.
train_df = item_to_save[item_to_save.group.isin(['train', 'validation'])]
test_df = item_to_save[item_to_save.group == 'test']
# Lightweight view without the embedding columns or the original texts.
light = item_to_save[['image_name', 'question', 'answer', 'group', 'path', 'tumor', 'hematoma', 'brain', 'abdomen', 'neck', 'liver', 'imaging_device']]

with VerboseTimer("Saving model training data"):
    # mode='w' (re)creates the file; `key` is passed by keyword because newer
    # pandas versions deprecate positional arguments after the path.
    light.to_hdf(data_location, key='light', mode='w', data_columns=['image_name', 'imaging_device', 'path'], format='table')
    # Re-open in the default append mode to add the remaining frames next to
    # the 'light' table.
    with HDFStore(data_location) as store:
        store['data'] = train_df
        store['test'] = test_df

size = get_size(data_location)
logger.debug(f"training data's file size was: {size}")



[2019-01-03 23:04:51,431][DEBUG] Saving the data
Saving model training data: 0:00:06.453103
[2019-01-03 23:04:58,039][DEBUG] training data's file size was: 1.18 GB


In [19]:
# Echo the path of the file that was just written.
data_location

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.h5'

In [20]:
# import numpy as np
# d = train_df[train_df.imaging_device.isin(['ct','mri'])]
# print(np.unique(train_df.imaging_device))
# print(np.unique(d.imaging_device))

In [21]:
# Final report: a printed caption, followed by the output path as the cell's
# displayed value.
print('Data saved at:')
str(data_location)

Data saved at:


'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.h5'