In [1]:
# %%capture
import IPython
import os
from pandas import HDFStore
import pyarrow as pa
import pyarrow.parquet as pq
import logging
from pathlib import Path

In [2]:
import vqa_logger 
# Logger for this notebook, named after the running module (__name__).
logger = logging.getLogger(__name__)

In [3]:
from common.constatns import raw_data_location, data_location
from common.settings import get_nlp, data_access
from common.functions import get_highlighted_function_code, get_image,  get_size
from pre_processing.prepare_data import get_text_features, pre_process_raw_data
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [4]:
# Obtain the NLP engine from the project settings helper
# (the recorded run logged the 'en_core_web_lg' embedding vectors).
nlp = get_nlp()

[32m[2019-02-04 21:47:01][DEBUG][0m [32musing embedding vector: en_core_web_lg[0m
[32m[2019-02-04 21:47:02][DEBUG][0m [32mGot NLP engine (en_core_web_lg)[0m


#### Where get_nlp is defined as:

In [5]:
# Ask the project helper for get_nlp's source (with remove_comments=True)
# and hand the result to IPython's rich display.
code = get_highlighted_function_code(
    get_nlp,
    remove_comments=True,
)
IPython.display.display(code)

In [7]:
# Load the raw input table through the project's data_access object.
image_name_question = data_access.load_raw_input()

In [8]:
# Preview the first rows of the loaded table.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path
0,synpic41148,what kind of image is this?,cta - ct angiography,train,C:\Users\Public\Documents\Data\2019\train\Trai...
1,synpic43984,is this a t1 weighted image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
2,synpic38930,what type of imaging modality is used to acqui...,us - ultrasound,train,C:\Users\Public\Documents\Data\2019\train\Trai...
3,synpic52143,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...
4,synpic20934,what type of image modality is this?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...


##### This is just for performance and quick debug cycles! Remove before actual training:

In [9]:
# Debug-only subsetting to 5 rows; keep commented out for real runs.
# image_name_question = image_name_question.head(5)
# NOTE(review): image_name_question_val is not defined in any cell shown in
# this notebook - confirm it exists before enabling the line below.
# image_name_question_val = image_name_question_val.head(5)

### Additional functions we will use:

#### get_text_features:

In [10]:
# Ask the project helper for get_text_features' source (with
# remove_comments=True) and hand the result to IPython's rich display.
code = get_highlighted_function_code(
    get_text_features,
    remove_comments=True,
)
IPython.display.display(code)

#### get_image:

In [11]:
# Ask the project helper for get_image's source (with remove_comments=True)
# and hand the result to IPython's rich display.
code = get_highlighted_function_code(
    get_image,
    remove_comments=True,
)
IPython.display.display(code)

#### pre_process_raw_data:

In [12]:
# Ask the project helper for pre_process_raw_data's source (with
# remove_comments=True) and hand the result to IPython's rich display.
code = get_highlighted_function_code(
    pre_process_raw_data,
    remove_comments=True,
)
IPython.display.display(code)


### Clean and enrich the data

In [13]:
from pre_processing.data_enrichment import enrich_data
from pre_processing.data_cleaning import clean_data

# Keep a copy of the table as loaded, before the name is rebound below.
orig_image_name_question = image_name_question.copy()
# Run clean_data first, then enrich_data on its result; the outcome replaces
# image_name_question for the cells that follow.
image_name_question = enrich_data(clean_data(image_name_question))

Looking for word: arch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:03<00:00, 25.64it/s]
Looking for word: breast: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 33.64it/s]
Looking for word: mammograph: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 24.79it/s]
  0%|                                                                                                                                                                                                                | 0/3700 [00:00<?, ?it/s]

[32m[2019-02-04 21:47:16][INFO][0m consolidating image devices


image device:	mr         : 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3700/3700 [00:53<00:00, 69.28it/s]




In [14]:
# Summary statistics per value of the 'group' column (e.g. 'train').
# Displayed explicitly: a notebook cell only renders its last expression, so a
# bare `groups.describe()` in the middle of the cell is computed and discarded.
groups = image_name_question.groupby('group')
IPython.display.display(groups.describe())
# Per imaging device: count / unique / top / freq of the image names.
image_name_question[['imaging_device','image_name']].groupby('imaging_device').describe()

Unnamed: 0_level_0,image_name,image_name,image_name,image_name
Unnamed: 0_level_1,count,unique,top,freq
imaging_device,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
angiogram,396,99,synpic59107,4
ct,4283,1071,synpic41341,4
mammograph,128,32,synpic20049,4
mr,5649,1413,synpic45877,4
unknown,1311,328,synpic43955,4
us,862,216,synpic24249,4
xr,2163,541,synpic50593,4


### Do the actual pre processing

#### If running in an exported notebook, use the following:
(Indent the rest of the code so that it sits under the main guard, to avoid recursive spawning of processes.)

In [15]:
from multiprocessing import freeze_support

if __name__ == '__main__':
    # freeze_support() is documented to be called straight after the main
    # guard line, so it runs before any other statement (including the print).
    freeze_support()
    print('in main')

in main


Note:  
This might take a while...

In [16]:
# Log a banner, then run pre_process_raw_data inside a VerboseTimer.
# The recorded run logged an answer-embedding stage and a question-embedding
# stage, about 2.5 minutes in total.
logger.debug('----===== Preproceccing train data =====----')
with VerboseTimer("Pre processing training data"):
    image_name_question_processed = pre_process_raw_data(image_name_question)

[32m[2019-02-04 21:48:09][DEBUG][0m [32m----===== Preproceccing train data =====----[0m
[32m[2019-02-04 21:48:10][INFO][0m Getting answers embedding
[32m[2019-02-04 21:49:24][DEBUG][0m [32mAnswer Embedding: 0:01:14.546679[0m
[32m[2019-02-04 21:49:24][INFO][0m Getting questions embedding
[32m[2019-02-04 21:50:45][DEBUG][0m [32mQuestion Embedding: 0:01:21.035802[0m
[32m[2019-02-04 21:50:45][DEBUG][0m [32mPre processing: 0:02:35.817929[0m
[32m[2019-02-04 21:50:45][DEBUG][0m [32mDone[0m
[32m[2019-02-04 21:50:45][DEBUG][0m [32mPre processing training data: 0:02:35.824905[0m


In [17]:
# Preview the processed table; the recorded output shows the added
# answer_embedding and question_embedding columns.
image_name_question_processed.head()

Unnamed: 0,index,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,locations,imaging_device,answer_embedding,question_embedding
0,0,synpic41148.jpg,what kind of image is this?,cta - ct angiography,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what kind of image is this?,cta - ct angiography,,,ct,"[-0.946086049079895, 0.675370454788208, 1.3840...","[-2.1590447425842285, 3.4943666458129883, 0.19..."
1,1,synpic43984.jpg,is this a t1 weighted image?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a t1 weighted image?,no,,,unknown,"[0.029011979699134827, 1.9719411134719849, 1.5...","[1.099464774131775, 0.1577463150024414, -2.948..."
2,2,synpic38930.jpg,what type of imaging modality is used to acqui...,us - ultrasound,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what type of imaging modality is used to acqui...,us - ultrasound,,,us,"[1.3107359409332275, -1.1645644903182983, 1.46...","[-2.1146199703216553, 3.82827091217041, -0.040..."
3,3,synpic52143.jpg,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a noncontrast mr?,no,,,mr,"[0.029011979699134827, 1.9719411134719849, 1.5...","[1.242419958114624, 0.27193427085876465, -2.95..."
4,4,synpic20934.jpg,what type of image modality is this?,xr - plain film,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what type of image modality is this?,xr - plain film,,,xr,"[2.1364431381225586, 0.8484694957733154, -0.96...","[-2.118190050125122, 3.8304693698883057, 0.049..."


In [18]:
# Show the rows that belong to a single example image.
image_name_question.loc[image_name_question['image_name'] == 'synpic52143.jpg'].head()

Unnamed: 0,image_name,question,answer,group,path,processed_question,processed_answer,diagnosis,locations,imaging_device
3,synpic52143.jpg,is this a noncontrast mri?,no,train,C:\Users\Public\Documents\Data\2019\train\Trai...,is this a noncontrast mr?,no,,,mr
3203,synpic52143.jpg,which plane is the image shown in?,coronal,train,C:\Users\Public\Documents\Data\2019\train\Trai...,which plane is the image shown in?,coronal,,,mr
6403,synpic52143.jpg,the mri shows what organ system?,spine and contents,train,C:\Users\Public\Documents\Data\2019\train\Trai...,the mr shows what organ system?,spine and contents,,spine and contents,mr
9603,synpic52143.jpg,what is the primary abnormality in this image?,bone tumor/ chordoma,train,C:\Users\Public\Documents\Data\2019\train\Trai...,what is the primary abnormality in this image?,bone tumor/ chordoma,tumor bone,,mr


#### Saving the data, so later on we don't need to compute it again

### TODO: add question classification that takes the 2019 data into consideration

In [20]:
# Persist the processed frame through the project's data_access object.
# NOTE(review): the recorded output reports "training data's file size was: 0 B";
# verify that the file was actually written and that the size check reads the right path.
data_access.save_processed_data(image_name_question_processed)

[32m[2019-02-04 21:50:45][DEBUG][0m [32mSaving the data[0m
[32m[2019-02-04 21:51:25][DEBUG][0m [32mSaving model training data: 0:00:39.728983[0m
[32m[2019-02-04 21:51:25][DEBUG][0m [32mtraining data's file size was: 0 B[0m


In [21]:
# Print a label, then leave the location (as a string) as the cell's last
# expression so the notebook displays it.
print('Data saved at:')
str(data_location)

Data saved at:


'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\model_input.parquet'