In [1]:
# %%capture
import IPython
import os
from pandas import HDFStore
import pyarrow as pa
import pyarrow.parquet as pq
import logging
from pathlib import Path

In [2]:
# Notebook-level logger, named after the current module (`__main__` when the notebook is run directly).
logger = logging.getLogger(__name__)

In [3]:
from common.settings import get_nlp, data_access
from common.functions import get_highlighted_function_code, get_image,  get_size
from pre_processing.prepare_data import get_text_features, pre_process_raw_data
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [4]:
# Obtain the project's NLP engine up front.
# In the recorded run the log reports the `en_core_web_lg` embedding vectors being used.
nlp = get_nlp()

[2019-02-09 23:41:43][DEBUG] using embedding vector: en_core_web_lg
[2019-02-09 23:41:43][DEBUG] Got NLP engine (en_core_web_lg)


#### Where get_nlp is defined as:

In [5]:
# Ask the project helper for the source of `get_nlp` with comments removed,
# then render the returned object inline in the notebook.
code = get_highlighted_function_code(get_nlp,remove_comments=True)
IPython.display.display(code)

In [6]:
# Load the raw input table through the project's data-access object.
# In the recorded run the log shows it being read from `raw_data.h5`.
image_name_question = data_access.load_raw_input()

[2019-02-09 23:41:44][DEBUG] Loading data from: C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\raw_data.h5
[2019-02-09 23:41:44][DEBUG] Loading raw data: 0:00:00.124159


In [7]:
# Peek at the first rows to confirm the loaded columns
# (image_name, question, answer, group, path in the recorded output).
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path
0,synpic41148,what kind of image is this?,cta - ct angiography,train,C:\Users\Public\Documents\Data\2019\train\Trai...
1,synpic43984,is this a t1 weighted image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
2,synpic38930,what type of imaging modality is used to acqui...,us - ultrasound,train,C:\Users\Public\Documents\Data\2019\train\Trai...
3,synpic52143,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
4,synpic20934,what type of image modality is this?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...


##### This is just for performance and quick debug cycles! Remove before actual training:

### Additional functions we will use:

#### get_text_features:

In [8]:
# Ask the project helper for the source of `get_text_features` with comments removed,
# then render the returned object inline in the notebook.
code = get_highlighted_function_code(get_text_features,remove_comments=True)
IPython.display.display(code)

#### get_image:

In [9]:
# Ask the project helper for the source of `get_image` with comments removed,
# then render the returned object inline in the notebook.
code = get_highlighted_function_code(get_image,remove_comments=True)
IPython.display.display(code)

#### pre_process_raw_data:

In [10]:
# Ask the project helper for the source of `pre_process_raw_data` with comments removed,
# then render the returned object inline in the notebook.
code = get_highlighted_function_code(pre_process_raw_data,remove_comments=True)
IPython.display.display(code)


### Clean and enrich the data

In [11]:
from pre_processing.data_enrichment import enrich_data
from pre_processing.data_cleaning import clean_data

# Keep an untouched copy of the raw frame before the name is rebound below.
orig_image_name_question = image_name_question.copy()
# Clean first, then enrich. In the recorded run the resulting frame has the extra columns
# processed_question, processed_answer, diagnosis and question_category.
# NOTE(review): this cell rebinds `image_name_question` to its own output, so re-running it
# without reloading the raw data feeds already-processed rows back into clean_data/enrich_data
# (and overwrites `orig_image_name_question` with a processed copy).
image_name_question = clean_data(image_name_question)
image_name_question = enrich_data(image_name_question)

Looking for word: arch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:03<00:00, 27.23it/s]


In [12]:
# Per-split summary statistics: the `group` column takes the values `train` and `validation`
# in the recorded output.
groups = image_name_question.groupby('group')
groups.describe()

Unnamed: 0_level_0,answer,answer,answer,answer,diagnosis,diagnosis,diagnosis,diagnosis,image_name,image_name,...,processed_question,processed_question,question,question,question,question,question_category,question_category,question_category,question_category
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
train,12792,1552,axial,1558,12792,274,,10569,12792,3200,...,what abnormality is seen in the image?,776,12792,247,what abnormality is seen in the image?,776,12792,4,Modality,3200
validation,2000,470,axial,213,2000,133,,1669,2000,500,...,what abnormality is seen in the image?,133,2000,186,what abnormality is seen in the image?,133,2000,4,Plane,500


In [13]:
# Show a random sample of 7 rows of the cleaned and enriched frame.
# A notebook cell renders only its last expression, so this single call is all that is
# needed here; any bare expression placed before it would be evaluated and discarded.
image_name_question.sample(n=7)

Unnamed: 0,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,question_category
9591,synpic26843,what organ system is visualized?,heart and great vessels,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what organ system is visualized?,heart and great vessels,,Organ
5494,synpic54081,what image plane is this?,axial,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what image plane is this?,axial,,Plane
10810,synpic52357,what is abnormal in the mri?,cervical hemangioblastoma,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what is abnormal in the mr?,cervical hemangioblastoma,,Abnormality
1950,synpic40992,is this an mri image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this an mr image?,no,,Modality
10187,synpic41789,what is most alarming about this mri?,brain abscess vs. cerebritis,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what is most alarming about this mr?,brain abscess vs. cerebritis,abscess,Abnormality
461,synpic56691,what type of imaging was used?,mr - pdw proton density,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what type of imaging was used?,mr - pdw proton density,,Modality
14184,synpic40559,what organ system is imaged?,spine and contents,validation,C:\Users\Public\Documents\Data\2019\validation...,what organ system is imaged?,spine and contents,,Organ


### Do the actual pre processing

#### If running in an exported notebook, use the following:
(indent everything so that it sits under the main guard, to avoid recursive spawning of processes)

In [14]:
from multiprocessing import freeze_support
# Main guard: when the notebook is exported to a script, the process-spawning work is meant to
# live under this guard so that child processes importing the module do not re-run it.
if __name__ == '__main__':
    print('in main')
    # Per the multiprocessing docs this only matters for frozen Windows executables
    # and has no effect otherwise.
    freeze_support()

in main


Note:  
This might take a while...

In [15]:
# Run the full pre-processing step on the cleaned and enriched frame.
# Per the recorded log, this tokenizes answers and questions (removing stop words) and then
# computes answer and question embeddings; the whole step took about 2m21s in that run.
# NOTE(review): the banner below says "train data" (and misspells "Preprocessing"), but the
# frame passed in still contains the `validation` rows as well — confirm the intended wording.
logger.debug('----===== Preproceccing train data =====----')
image_name_question_processed = pre_process_raw_data(image_name_question)

[2019-02-09 23:41:55][DEBUG] ----===== Preproceccing train data =====----
[2019-02-09 23:41:55][INFO] Answer: removing stop words and tokenizing
[2019-02-09 23:41:55][DEBUG] Answer Tokenizing: 0:00:00.014147
[2019-02-09 23:41:55][INFO] Question: removing stop words and tokenizing
[2019-02-09 23:41:55][DEBUG] Question Tokenizing: 0:00:00.021958
[2019-02-09 23:41:55][INFO] Getting answers embedding
[2019-02-09 23:43:04][DEBUG] Answer Embedding: 0:01:08.607068
[2019-02-09 23:43:04][INFO] Getting questions embedding
[2019-02-09 23:44:16][DEBUG] Question Embedding: 0:01:12.028090
[2019-02-09 23:44:16][DEBUG] Pre processing: 0:02:21.025855
[2019-02-09 23:44:16][DEBUG] Done


In [16]:
# Spot-check a few random rows; in the recorded output the frame now carries
# `answer_embedding` and `question_embedding` columns.
image_name_question_processed.sample(5)

Unnamed: 0,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,question_category,answer_embedding,question_embedding
12598,synpic54732.jpg,what is abnormal in the ct scan?,paraganglioma,train,C:\Users\Public\Documents\Data\2019\train\Trai...,abnormal ct?,paraganglioma,,Abnormality,"[-0.6973563432693481, 1.132625937461853, 3.071...","[0.2237924486398697, -1.8565819263458252, 0.31..."
5950,synpic45945.jpg,in what plane is this x-ray captured?,ap,train,C:\Users\Public\Documents\Data\2019\train\Trai...,plane x-ray captured?,ap,,Plane,"[1.4533283710479736, -0.9862573146820068, 4.03...","[1.9154289960861206, 1.7209750413894653, 2.659..."
3039,synpic30911.jpg,what type of imaging was used?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...,type imaging used?,xr - plain film,,Modality,"[2.1364431381225586, 0.8484694957733154, -0.96...","[2.609076976776123, 4.5359206199646, -0.695239..."
9550,synpic35688.jpg,what organ system is pictured here?,heart and great vessels,train,C:\Users\Public\Documents\Data\2019\train\Trai...,organ system pictured here?,heart great vessels,,Organ,"[2.8541481494903564, 2.0048623085021973, -1.49...","[-0.06395956873893738, -1.3520504236221313, -2..."
5302,synpic23087.jpg,what is the plane?,axial,train,C:\Users\Public\Documents\Data\2019\train\Trai...,plane?,axial,,Plane,"[-1.3220698833465576, -0.9305600523948669, 0.8...","[0.24050864577293396, 0.472703218460083, 0.173..."


In [17]:
# Inspect the question/answer rows attached to a single image.
# `image_name` carries the `.jpg` extension at this stage (the raw table shown earlier did not).
# NOTE(review): in the recorded output the row whose answer is `no` has an empty
# `processed_answer` and an `answer_embedding` that starts with zeros — presumably stop-word
# removal dropped `no`; verify in pre_process_raw_data before training on yes/no questions.
image_name_question_processed.loc[
    lambda frame: frame.image_name == 'synpic52143.jpg'
].head()

Unnamed: 0,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,question_category,answer_embedding,question_embedding
3,synpic52143.jpg,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,noncontrast mr?,,,Modality,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.5954318046569824, 2.331437110900879, -0.321..."
3203,synpic52143.jpg,which plane is the image shown in?,coronal,train,C:\Users\Public\Documents\Data\2019\train\Trai...,plane image shown in?,coronal,,Plane,"[-2.5162551403045654, -0.6533107757568359, 0.8...","[1.563550591468811, 1.2665960788726807, 0.6431..."
6403,synpic52143.jpg,the mri shows what organ system?,spine and contents,train,C:\Users\Public\Documents\Data\2019\train\Trai...,mr shows organ system?,spine contents,,Organ,"[1.7562930583953857, 0.6799577474594116, 0.671...","[0.5471814274787903, -0.6695543527603149, 2.93..."
9603,synpic52143.jpg,what is the primary abnormality in this image?,bone tumor/ chordoma,train,C:\Users\Public\Documents\Data\2019\train\Trai...,primary abnormality image?,bone tumor/ chordoma,bone tumor,Abnormality,"[2.0147864818573, 0.44317686557769775, -0.7659...","[2.251065969467163, -0.7645939588546753, -2.76..."


#### Saving the data, so later on we don't need to compute it again

In [18]:
# Persist the processed frame through the data-access object and keep the value it returns.
# In the recorded run this wrote `model_input.parquet` and took roughly 35 seconds.
saved_path = data_access.save_processed_data(image_name_question_processed)

[2019-02-09 23:44:16][DEBUG] Saving the processed data to:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.parquet
[2019-02-09 23:44:51][DEBUG] Saving processed data: 0:00:34.859684


In [19]:
# Report where the processed data was written.
print(f'Data saved at:\n{saved_path}')

Data saved at:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.parquet
