In [1]:
# %%capture
import IPython
import os
import numpy as np
import pandas as pd
from pandas import HDFStore
import spacy
from keras.utils import to_categorical
import cv2
from collections import defaultdict
import logging
from common.os_utils import File

Using TensorFlow backend.


In [2]:
# Initialise logging through the project's init_log helper, then create the
# logger used by the rest of this notebook.
from vqa_logger import init_log
init_log()
logger = logging.getLogger(__name__)

In [3]:
from common.constatns import train_data, validation_data, data_location, raw_data_location
from common.settings import input_length, embedding_dim, image_size, seq_length, get_nlp
from common.functions import get_highlighted_function_code, get_image,  get_size
from pre_processing.prepare_data import get_text_features, pre_process_raw_data
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [4]:
# Obtain the NLP engine from the get_nlp helper imported from common.settings;
# the log output below reports which embedding model was picked.
nlp = get_nlp()

[2019-01-19 23:22:26][DEBUG] using embedding vector: en_core_web_lg
[2019-01-19 23:22:27][DEBUG] Got NLP engine (en_core_web_lg)


#### Where get_nlp is defined as:

In [5]:
# Show how get_nlp is defined: hand it to get_highlighted_function_code
# (remove_comments=True) and display whatever that helper returns.
code = get_highlighted_function_code(get_nlp,remove_comments=True)
IPython.display.display(code)

In [6]:
# Load the raw image/question/answer table from the 'data' key of the raw store.
# The store is opened read-only on purpose: the default mode ('a') would create
# an empty HDF5 file if raw_data_location points to a missing path and would
# request write access for what is a pure read.
with HDFStore(raw_data_location, mode='r') as store:
    image_name_question = store['data']

In [7]:
# Peek at the first rows of the raw table that was read from the HDF store.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path
0,rjv03401,what does mri show?,lesion at tail of pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
1,AIAN-14-313-g002,where does axial section mri abdomen show hypo...,in distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
2,wjem-11-76f3,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
3,ccr30002-0045-f3,what was normal?,blood supply to the brain,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
4,rjt01904,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...


##### This is just for performance and quick debug cycles! Remove before actual training:

In [8]:
# Debug-only shortcut: uncomment to work on just the first 5 rows.
# NOTE(review): image_name_question_val is not defined anywhere in the visible
# part of this notebook - confirm it exists before uncommenting the second line.
# image_name_question = image_name_question.head(5)
# image_name_question_val = image_name_question_val.head(5)

### Additional functions we will use:

#### get_text_features:

In [9]:
# Show the definition of get_text_features: hand it to
# get_highlighted_function_code (remove_comments=True) and display the result.
code = get_highlighted_function_code(get_text_features,remove_comments=True)
IPython.display.display(code)

#### get_image:

In [10]:
# Show the definition of get_image: hand it to
# get_highlighted_function_code (remove_comments=True) and display the result.
code = get_highlighted_function_code(get_image,remove_comments=True)
IPython.display.display(code)

#### pre_process_raw_data:

In [11]:
# Show the definition of pre_process_raw_data: hand it to
# get_highlighted_function_code (remove_comments=True) and display the result.
code = get_highlighted_function_code(pre_process_raw_data,remove_comments=True)
IPython.display.display(code)


### Clean and enrich the data

In [12]:
from pre_processing.data_enrichment import enrich_data
from pre_processing.data_cleaning import clean_data

# Keep an untouched snapshot of the raw frame, then replace the working frame
# with the result of cleaning followed by enrichment.
# NOTE(review): re-running this cell feeds already-processed data back in and
# overwrites the snapshot - reload the raw data first if a re-run is needed.
orig_image_name_question = image_name_question.copy()
image_name_question = enrich_data(clean_data(image_name_question))

  0%|                                                                                                                                                                                                                | 0/2866 [00:00<?, ?it/s][2019-01-19 23:22:29][INFO] consolidating image devices
image device:	mri        : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2866/2866 [00:25<00:00, 110.47it/s]


In [13]:
# Look-up of one specific image by name. Only the last expression of a cell is
# rendered, so this result is computed but never shown.
# NOTE(review): the rows displayed below carry image names without a '.jpg'
# suffix at this stage - confirm this filter can match anything here.
image_name_question.loc[image_name_question['image_name'] == '0392-100X-33-350-g002.jpg'].head()
# First rows of the cleaned and enriched frame (this is the displayed output).
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path,original_question,original_answer,tumor,hematoma,brain,abdomen,neck,liver,imaging_device
0,rjv03401,what does mri show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,lesion at tail of pancreas,True,False,False,False,False,False,mri
1,AIAN-14-313-g002,where does axial section mri abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,where does axial section mri abdomen show hypo...,in distal pancreas,False,False,False,True,False,False,mri
2,wjem-11-76f3,what do arrows denote noncontrast ct pelvis?,complex fluid collection with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,False,True,False,False,False,False,ct
3,ccr30002-0045-f3,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what was normal?,blood supply to the brain,False,False,False,False,False,False,ct
4,rjt01904,what shows evidence a contaed rupture?,repeat ct abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,False,False,False,True,False,False,ct


In [14]:
# Per-group summary; kept in `groups`, but its describe() result is not the
# last expression of the cell and is therefore not rendered.
groups = image_name_question.groupby('group')
groups.describe()
# Displayed output: image_name statistics per imaging device.
image_name_question.loc[:, ['imaging_device', 'image_name']].groupby('imaging_device').describe()

Unnamed: 0_level_0,image_name,image_name,image_name,image_name
Unnamed: 0_level_1,count,unique,top,freq
imaging_device,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ct,3721,1686,1477-7819-6-60-1,7
mra,12,4,jthc-8-58-g001,3
mri,2153,933,crg0005-0583-f01,6
unknown,527,243,TOORTHJ-9-367_F2,6


### Do the actual pre processing

#### If running in an exported notebook, use the following:
(Indent everything below so that it sits under the main guard; this avoids recursively spawning processes.)

In [15]:
from multiprocessing import freeze_support

# Main guard for the exported-script case: code under it runs only in the
# process that was started as the main module.
if __name__ == '__main__':
    # The multiprocessing docs require freeze_support() to be called straight
    # after the `if __name__ == '__main__'` line, so it must come before any
    # other statement (it is a no-op outside frozen Windows executables).
    freeze_support()
    print('in main')

in main


Note:  
This might take a while...

In [16]:
# Run the pre-processing step on the cleaned and enriched frame. The call is
# wrapped in VerboseTimer, whose label shows up with a duration in the log below.
logger.debug('----===== Preproceccing train data =====----')
# NOTE(review): image_locations is not read again in the visible notebook -
# confirm it is still needed.
image_locations = train_data.images_path
with VerboseTimer("Pre processing training data"):
    image_name_question_processed = pre_process_raw_data(image_name_question)

[2019-01-19 23:22:55][DEBUG] ----===== Preproceccing train data =====----
[2019-01-19 23:22:55][INFO] Getting answers embedding
[2019-01-19 23:23:26][DEBUG] Answer Embedding: 0:00:31.395067
[2019-01-19 23:23:26][INFO] Getting questions embedding
[2019-01-19 23:24:00][DEBUG] Question Embedding: 0:00:33.548964
[2019-01-19 23:24:00][INFO] Tagging image questions
[2019-01-19 23:24:01][DEBUG] Tagging image questions: 0:00:00.880122
[2019-01-19 23:24:01][DEBUG] Pre processing: 0:01:05.984577
[2019-01-19 23:24:01][DEBUG] Done
[2019-01-19 23:24:01][DEBUG] Pre processing training data: 0:01:06.062170


In [17]:
# Preview the processed frame; the output below shows the added
# answer_embedding, question_embedding and is_imaging_device_question columns.
image_name_question_processed.head()

Unnamed: 0,index,image_name,question,answer,group,path,original_question,original_answer,tumor,hematoma,brain,abdomen,neck,liver,imaging_device,answer_embedding,question_embedding,is_imaging_device_question
0,0,rjv03401.jpg,what does mri show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,lesion at tail of pancreas,True,False,False,False,False,False,mri,"[[3.8335671424865723, 0.9851416349411011, 0.60...","[[-1.8407480716705322, 2.5507988929748535, 0.7...",1
1,1,AIAN-14-313-g002.jpg,where does axial section mri abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,where does axial section mri abdomen show hypo...,in distal pancreas,False,False,False,True,False,False,mri,"[[0.9880439043045044, 0.907943844795227, -1.30...","[[0.35850387811660767, 1.4076576232910156, -3....",0
2,2,wjem-11-76f3.jpg,what do arrows denote noncontrast ct pelvis?,complex fluid collection with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,False,True,False,False,False,False,ct,"[[0.16135236620903015, -1.7424618005752563, -1...","[[-1.4648534059524536, 3.1732239723205566, 2.0...",0
3,3,ccr30002-0045-f3.jpg,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what was normal?,blood supply to the brain,False,False,False,False,False,False,ct,"[[1.421677589416504, 1.1374449729919434, 0.465...","[[-2.699403762817383, 1.9192107915878296, 0.21...",0
4,4,rjt01904.jpg,what shows evidence a contaed rupture?,repeat ct abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,False,False,False,True,False,False,ct,"[[3.0298147201538086, -0.026108086109161377, 1...","[[-2.2646989822387695, 2.072265148162842, 0.51...",0


In [40]:
# Keep only the rows flagged as imaging-device questions, then preview the
# columns that matter for checking the flag.
# image_name_question_processed.columns
image_devicees_qa = image_name_question_processed.loc[
    image_name_question_processed['is_imaging_device_question'] == 1
]
image_devicees_qa[['group', 'question', 'answer', 'is_imaging_device_question']].head()
# image_devicees_qa.groupby('group').describe()

Unnamed: 0,group,question,answer,is_imaging_device_question
0,train,what does mri show?,tumor at tail pancreas,1
5,train,what does preoperative ct demonstrate?,severe looseng pedicle screws at l3 l5,1
6,train,what does axial contracted ct section show?,hypodense nodular tumor right adrenal gl,1
8,train,what does coronal ct demonstrate?,cisional lumbar hernia,1
10,train,what does coronal reformatted ct demonstrate m...,both distal femurs proximal tibias,1


In [41]:
# len(image_devicees_qa[image_devicees_qa['group']=='test'])

# image_devicees_qa[['group','question','answer','is_imaging_device_question']].head()
# tdf = image_devicees_qa[['group','question','answer','is_imaging_device_question','question_embedding']][image_devicees_qa['group'] == 'test']
# tdf.head()
# for i, r in tdf.iterrows():
#     v =r.question_embedding
#     q = r.question
#     break

# print(q)
# p = 'C:\\Users\\Public\\Documents\\Data\\2018\\imaging_dvices_classifiers\\question_classifier.pickle'
# question_classifier = File.load_pickle(p)
# embedding_input = np.asarray([v[0]])
# question_classifier.predict(embedding_input)


#### Saving the data, so later on we don't need to compute it again

In [42]:
# Ad-hoc checks on the imaging_device column. Only the last expression is
# rendered as the cell output; the first two are evaluated and discarded.
# 1) distinct imaging_device values
image_name_question_processed['imaging_device'].drop_duplicates()
# 2) images whose device was tagged as the combined value 'ct mri'
image_name_question_processed.loc[
    image_name_question_processed['imaging_device'] == 'ct mri', 'image_name'
].drop_duplicates()
# 3) device recorded for one specific image (displayed)
image_name_question_processed.loc[
    image_name_question_processed['image_name'] == 'JPN-9-48-g001.jpg', 'imaging_device'
]

548     unknown
1113    unknown
3777    unknown
Name: imaging_device, dtype: object

In [43]:
logger.debug("Saving the data")
# Single switch for what gets persisted: every frame written below is derived
# from item_to_save, so pointing it at a small sample gives a quick debug run.
item_to_save = image_name_question_processed
# item_to_save = image_name_question_processed.head(10)

# Remove a previous output file, if any. A missing file is the normal case and
# is ignored; any other failure (e.g. the file is locked) is still tolerated,
# because the write below uses mode='w', but it is logged instead of being
# silently swallowed.
try:
    os.remove(data_location)
except FileNotFoundError:
    pass
except OSError as remove_error:
    logger.warning(f"Could not remove existing data file {data_location}: {remove_error}")


# 'train' and 'validation' rows are stored together under one key; 'test' rows
# are stored separately.
train_df = item_to_save[(item_to_save.group == 'train') | (item_to_save.group == 'validation')]
test_df = item_to_save[item_to_save.group == 'test']
# Column subset without the embedding columns, written in queryable table format.
light = item_to_save[['image_name', 'question', 'answer', 'group', 'path', 'tumor', 'hematoma', 'brain', 'abdomen', 'neck', 'liver', 'imaging_device']]


with VerboseTimer("Saving model training data"):
    # mode='w' (re)creates the file; `key` is passed by keyword because the
    # positional form is deprecated in recent pandas releases.
    light.to_hdf(data_location, key='light', mode='w', data_columns=['image_name', 'imaging_device', 'path'], format='table')
    # Re-open the same file in append mode to add the full frames next to 'light'.
    with HDFStore(data_location, mode='a') as store:
        store['data'] = train_df
        store['test'] = test_df

size = get_size(data_location)
logger.debug(f"training data's file size was: {size}")



[2019-01-19 23:36:10][DEBUG] Saving the data
[2019-01-19 23:36:17][DEBUG] Saving model training data: 0:00:06.733547
[2019-01-19 23:36:17][DEBUG] training data's file size was: 1.18 GB


In [44]:
# Display the path of the HDF5 file the data was written to.
data_location

'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.h5'

In [45]:
# import numpy as np
# d = train_df[train_df.imaging_device.isin(['ct','mri'])]
# print(np.unique(train_df.imaging_device))
# print(np.unique(d.imaging_device))

In [46]:
# Report the output location: the label is printed, and the path itself is the
# cell's displayed value (last expression).
print('Data saved at:')
f'{data_location}'

Data saved at:


'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.h5'