In [1]:
# %%capture
import IPython
import os
from pandas import HDFStore
import pyarrow as pa
import pyarrow.parquet as pq
import logging
from pathlib import Path

In [2]:
# Logger for this notebook, named after the current module via __name__.
logger = logging.getLogger(__name__)

In [3]:
from common.settings import get_nlp, data_access
from common.functions import get_highlighted_function_code, get_image,  get_size
from pre_processing.prepare_data import get_text_features, pre_process_raw_data
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [4]:
# Obtain the NLP engine from the project helper in common.settings.
# In the recorded run the log reports the en_core_web_lg embedding model.
nlp = get_nlp()

[2019-02-07 22:30:58][DEBUG] using embedding vector: en_core_web_lg
[2019-02-07 22:30:59][DEBUG] Got NLP engine (en_core_web_lg)


#### Where get_nlp is defined as:

In [5]:
# Render the source of get_nlp for the reader; remove_comments=True asks the
# project helper to leave comments out of the rendered code.
IPython.display.display(
    get_highlighted_function_code(get_nlp, remove_comments=True)
)

In [6]:
# Load the raw question/answer table through the project's data_access helper.
image_name_question = data_access.load_raw_input()

[2019-02-07 22:30:59][DEBUG] Loading data from: C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\raw_data.h5
[2019-02-07 22:30:59][DEBUG] Loading raw data: 0:00:00.156028


In [7]:
# Preview the first rows of the raw data.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path
0,synpic41148,what kind of image is this?,cta - ct angiography,train,C:\Users\Public\Documents\Data\2019\train\Trai...
1,synpic43984,is this a t1 weighted image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
2,synpic38930,what type of imaging modality is used to acqui...,us - ultrasound,train,C:\Users\Public\Documents\Data\2019\train\Trai...
3,synpic52143,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
4,synpic20934,what type of image modality is this?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...


##### This is just for performance and quick debug cycles! Remove before actual training:

### Additional functions we will use:

#### get_text_features:

In [8]:
# Render the source of get_text_features for the reader; remove_comments=True
# asks the project helper to leave comments out of the rendered code.
IPython.display.display(
    get_highlighted_function_code(get_text_features, remove_comments=True)
)

#### get_image:

In [9]:
# Render the source of get_image for the reader; remove_comments=True asks the
# project helper to leave comments out of the rendered code.
IPython.display.display(
    get_highlighted_function_code(get_image, remove_comments=True)
)

#### pre_process_raw_data:

In [10]:
# Render the source of pre_process_raw_data for the reader;
# remove_comments=True asks the project helper to leave comments out.
IPython.display.display(
    get_highlighted_function_code(pre_process_raw_data, remove_comments=True)
)


### Clean and enrich the data

In [11]:
from pre_processing.data_enrichment import enrich_data
from pre_processing.data_cleaning import clean_data

# Keep a copy of the raw frame, then run cleaning followed by enrichment and
# rebind the working name to the result.
orig_image_name_question = image_name_question.copy()
image_name_question = enrich_data(clean_data(image_name_question))

Looking for word: arch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:03<00:00, 24.39it/s]


In [12]:
# Split the rows by their 'group' column and summarise each column per group.
groups = image_name_question.groupby('group')
groups.describe()

Unnamed: 0_level_0,answer,answer,answer,answer,diagnosis,diagnosis,diagnosis,diagnosis,image_name,image_name,...,processed_question,processed_question,question,question,question,question,question_category,question_category,question_category,question_category
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
train,12792,1552,axial,1558,12792,274,,10569,12792,3200,...,what abnormality is seen in the image?,776,12792,247,what abnormality is seen in the image?,776,12792,4,Plane,3200
validation,2000,470,axial,213,2000,133,,1669,2000,500,...,what abnormality is seen in the image?,133,2000,186,what abnormality is seen in the image?,133,2000,4,Abnormality,500


In [13]:
# Show a random sample of the cleaned and enriched rows. Only the last
# expression of a cell is rendered, so a single call is all that is needed.
# NOTE: the sample is unseeded, so the rows shown differ between runs.
image_name_question.sample(n=7)

Unnamed: 0,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,question_category
5014,synpic60666,in what plane is this image taken?,coronal,train,C:\Users\Public\Documents\Data\2019\train\Trai...,in what plane is this image taken?,coronal,,Plane
646,synpic100133,how was this image taken?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...,how was this image taken?,xr - plain film,,Modality
11629,synpic29767,what abnormality is seen in the image?,"fibromuscular dysplasia, renal infarct",train,C:\Users\Public\Documents\Data\2019\train\Trai...,what abnormality is seen in the image?,"fibromuscular dysplasia, renal infarct",dysplasia renal infarct,Abnormality
2448,synpic53182,was the ct scan taken with contrast?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,was ct taken with contrast?,no,,Modality
11248,synpic55521,what is abnormal in the ct scan?,splenic laceration,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what is abnormal ct?,splenic laceration,,Abnormality
13211,synpic47751,what type of imaging is this?,mra - mr angiography/venography,validation,C:\Users\Public\Documents\Data\2019\validation...,what type of imaging is this?,mra - mr angiography/venography,,Modality
14123,synpic51357,what organ system is displayed in this mri?,"face, sinuses, and neck",validation,C:\Users\Public\Documents\Data\2019\validation...,what organ system is displayed in this mr?,"face, sinuses, and neck",,Organ


### Do the actual pre processing

#### If running as a script exported from this notebook, use the following:
(indent everything below so that it runs under the main guard) - this avoids recursive spawning of processes

In [14]:
from multiprocessing import freeze_support

# Main guard: in a script exported from this notebook, code that may spawn
# worker processes belongs under this guard so that child processes importing
# the module do not run it again.
if __name__ == '__main__':
    # freeze_support() has to be called straight after the guard line, before
    # any other statement; it only has an effect in frozen Windows executables.
    freeze_support()
    print('in main')

in main


Note:  
This might take a while...

In [15]:
# Run the project's pre-processing on the cleaned and enriched frame.
# In the recorded run this computed answer and question embeddings and took
# roughly two and a half minutes.
logger.debug('----===== Preprocessing train data =====----')
image_name_question_processed = pre_process_raw_data(image_name_question)

[2019-02-07 22:31:12][DEBUG] ----===== Preproceccing train data =====----
[2019-02-07 22:31:12][INFO] Getting answers embedding
[2019-02-07 22:32:26][DEBUG] Answer Embedding: 0:01:13.341664
[2019-02-07 22:32:26][INFO] Getting questions embedding
[2019-02-07 22:33:46][DEBUG] Question Embedding: 0:01:20.335247
[2019-02-07 22:33:46][DEBUG] Pre processing: 0:02:34.007682
[2019-02-07 22:33:46][DEBUG] Done


In [16]:
# Preview the processed frame; per the recorded output it now carries
# answer_embedding and question_embedding columns.
image_name_question_processed.head()

Unnamed: 0,index,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,question_category,answer_embedding,question_embedding
0,0,synpic41148.jpg,what kind of image is this?,cta - ct angiography,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what kind of image is this?,cta - ct angiography,,Modality,"[-0.946086049079895, 0.675370454788208, 1.3840...","[-2.1590447425842285, 3.4943666458129883, 0.19..."
1,1,synpic43984.jpg,is this a t1 weighted image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a t1 weighted image?,no,,Modality,"[0.029011979699134827, 1.9719411134719849, 1.5...","[1.099464774131775, 0.1577463150024414, -2.948..."
2,2,synpic38930.jpg,what type of imaging modality is used to acqui...,us - ultrasound,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what type of imaging modality is used to acqui...,us - ultrasound,,Modality,"[1.3107359409332275, -1.1645644903182983, 1.46...","[-2.1146199703216553, 3.82827091217041, -0.040..."
3,3,synpic52143.jpg,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a noncontrast mr?,no,,Modality,"[0.029011979699134827, 1.9719411134719849, 1.5...","[1.242419958114624, 0.27193427085876465, -2.95..."
4,4,synpic20934.jpg,what type of image modality is this?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what type of image modality is this?,xr - plain film,,Modality,"[2.1364431381225586, 0.8484694957733154, -0.96...","[-2.118190050125122, 3.8304693698883057, 0.049..."


In [17]:
# Look at the first rows (up to five) recorded for one example image.
# NOTE(review): the recorded output shows '.jpg' suffixes in image_name on this
# frame, which earlier outputs did not - presumably pre_process_raw_data
# modified the frame in place; confirm.
image_name_question.loc[image_name_question['image_name'] == 'synpic52143.jpg'].head()

Unnamed: 0,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,question_category
3,synpic52143.jpg,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a noncontrast mr?,no,,Modality
3203,synpic52143.jpg,which plane is the image shown in?,coronal,train,C:\Users\Public\Documents\Data\2019\train\Trai...,which plane is the image shown in?,coronal,,Plane
6403,synpic52143.jpg,the mri shows what organ system?,spine and contents,train,C:\Users\Public\Documents\Data\2019\train\Trai...,the mr shows what organ system?,spine and contents,,Organ
9603,synpic52143.jpg,what is the primary abnormality in this image?,bone tumor/ chordoma,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what is the primary abnormality in this image?,bone tumor/ chordoma,tumor bone,Abnormality


#### Saving the data, so later on we don't need to compute it again

In [18]:
# Persist the processed frame via data_access and keep the returned location.
saved_path = data_access.save_processed_data(image_name_question_processed)

[2019-02-07 22:33:46][DEBUG] Saving the processed data to:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.parquet
[2019-02-07 22:34:24][DEBUG] Saving processed data: 0:00:37.814026


In [19]:
# Report where the processed data was written.
print(f'Data saved at:\n{saved_path}')


Data saved at:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.parquet
