In [1]:
# %%capture
import IPython
import os
import numpy as np
import pandas as pd
from pandas import HDFStore
import spacy
from keras.utils import to_categorical
import cv2
from collections import defaultdict

from vqa_logger import logger
from common.os_utils import File

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from common.constatns import train_data, validation_data, data_location, raw_data_location
from common.settings import input_length, embedding_dim, image_size, seq_length, get_nlp
from common.functions import get_highlited_function_code, get_image, get_text_features, pre_process_raw_data, get_size
from common.utils import VerboseTimer

### Preparing the data for training

#### Getting the nlp engine

In [3]:
# Get the NLP engine from common.settings.get_nlp; the recorded output of this
# cell shows the call logging which embedding vector it is using.
nlp = get_nlp()

[10:49:49][DEBUG] using embedding vector: en_core_web_sm
[10:49:55][DEBUG] Got embedding


#### Where get_nlp is defined as:

In [4]:
# Render the source of `get_nlp` inline so the reader does not have to open
# the module; the helper is asked to drop comments via remove_comments=True.
code = get_highlited_function_code(
    get_nlp,
    remove_comments=True,
)
IPython.display.display(code)

In [5]:
# Load the raw question/answer table stored under the 'data' key.
# The store is opened read-only: HDFStore's default mode ('a') would silently
# create an empty file when raw_data_location is missing (and then fail with a
# confusing KeyError), and would also leave the raw data open for writing.
with HDFStore(raw_data_location, mode='r') as store:
    image_name_question = store['data']
# df_train = image_name_question[image_name_question.group == 'train']
# df_val = image_name_question[image_name_question.group == 'validation']

# from parsers.VQA18 import Vqa18Base
# df_train = Vqa18Base.get_instance(train_data.processed_xls).data            
# df_val = Vqa18Base.get_instance(validation_data.processed_xls).data

##### This is just for performance and quick debug cycles! Remove before actual training:

In [6]:
# image_name_question = image_name_question.head(5)
# image_name_question_val = image_name_question_val.head(5)

### Additional functions we will use:

#### get_text_features:

In [7]:
# Render the source of `get_text_features` inline; the helper is asked to
# drop comments via remove_comments=True.
code = get_highlited_function_code(
    get_text_features,
    remove_comments=True,
)
IPython.display.display(code)

#### get_image:

In [8]:
# Render the source of `get_image` inline; the helper is asked to drop
# comments via remove_comments=True.
code = get_highlited_function_code(
    get_image,
    remove_comments=True,
)
IPython.display.display(code)

#### pre_process_raw_data:

In [9]:
# Render the source of `pre_process_raw_data` inline; the helper is asked to
# drop comments via remove_comments=True.
code = get_highlited_function_code(
    pre_process_raw_data,
    remove_comments=True,
)
IPython.display.display(code)


### Clean and enrich the data

In [10]:
from common.functions import enrich_data, clean_data
# Clean first, then enrich the cleaned frame; the result replaces
# `image_name_question` for the cells that follow.
image_name_question = enrich_data(
    clean_data(image_name_question)
)

In [11]:
# Preview the first rows of the frame produced by clean_data / enrich_data.
image_name_question.head()

Unnamed: 0,image_name,question,answer,group,path,tumor,hematoma,brain,abdomen,neck,liver,imaging_device
0,rjv03401,what does MRI show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,True,False,False,False,False,False,mri
1,AIAN-14-313-g002,where does axial seCTion MRI abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,True,False,False,mri
2,wjem-11-76f3,what do arrows denote noncontrast CT pelvis?,complex fluid colleCTion with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,True,False,False,False,False,ct
3,ccr30002-0045-f3,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,False,False,False,mri
4,rjt01904,what shows evidence a contaed rupture?,repeat CT abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,True,False,False,ct


In [12]:
# A notebook cell only auto-renders its LAST expression, so the per-group
# summary has to be displayed explicitly; otherwise it is computed and
# silently thrown away.
IPython.display.display(image_name_question.groupby('group').describe())

# Per-imaging-device summary of image names (rendered as the cell's output).
image_name_question[['imaging_device', 'image_name']].groupby('imaging_device').describe()

Unnamed: 0_level_0,image_name,image_name,image_name,image_name
Unnamed: 0_level_1,count,unique,top,freq
imaging_device,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
both,30,14,1746-160X-7-20-2,4
ct,2787,1354,1865-1380-4-64-4,6
mri,3087,1265,1477-7819-6-60-1,7
unknown,509,233,TOORTHJ-9-367_F2,6


### Do the actual pre processing
Note:  
This might take a while...

In [13]:
# # # # RRR
# # # logger.debug('Getting answers embedding')
# df = image_name_question
# df['l'] = df.answer.apply(lambda a: len(str(a)))
# df[df.l > 2].sort_values('l')
# # print(len(df[(df.answer == np.nan) | (df.question == np.nan)]))


# # df['answer'].apply(lambda q: get_text_features(q))
# # # a= df['answer'].apply(lambda q: 0 if q == np.nan else 1)
# # # sum(a), len(a), len(image_name_question)

# import json
# # json.load(open)
# a = df[df.group == 'test']['answer'].values[0]
# type(a)




In [14]:
# Run the (slow) pre-processing over the whole frame and time it.
logger.debug('----===== Preproceccing train data =====----')

# NOTE(review): image_locations is assigned here but is not passed to
# pre_process_raw_data in this cell - confirm whether it is still needed.
image_locations = train_data.images_path

training_timer = VerboseTimer("Pre processing training data")
with training_timer:
    image_name_question_processed = pre_process_raw_data(image_name_question)

[10:49:58][DEBUG] ----===== Preproceccing train data =====----
[10:49:58][DEBUG] Getting answers embedding
[10:51:01][DEBUG] Getting questions embedding
[10:52:21][DEBUG] Getting image features
[10:53:02][DEBUG] Done
Pre processing training data: 0:03:04.138849


In [15]:
# logger.debug('----===== Preproceccing validation data =====----')
# image_locations = validation_data.images_path
# with VerboseTimer("Pre processing validation data"):
#     image_name_question_val = pre_process_raw_data(image_name_question_val, image_locations)

#### Saving the data, so later on we don't need to compute it again

In [29]:
logger.debug("Saving the data")

# Single switch for what gets persisted: everything below derives from
# `item_to_save`, so swapping in the small debug frame actually takes effect.
item_to_save = image_name_question_processed
# item_to_save = image_name_question.head(10)

# Start from a clean file. Only "file does not exist" is an expected,
# ignorable outcome; any other OSError (file locked by another process,
# no permission, path is a directory, ...) is a real problem and must surface
# here instead of failing later with a less obvious HDF5 error.
try:
    os.remove(data_location)
except FileNotFoundError:
    pass

# Split by the `group` column: train + validation rows feed model training,
# test rows are stored separately.
is_train_or_val = item_to_save.group.isin(['train', 'validation'])
train_df = item_to_save[is_train_or_val]
test_df = item_to_save[item_to_save.group == 'test']

# "Light" view: only the listed metadata columns, without any other columns
# of the processed frame.
light_columns = [
    'image_name', 'question', 'answer', 'group', 'path',
    'tumor', 'hematoma', 'brain', 'abdomen', 'neck', 'liver',
    'imaging_device',
]
light = item_to_save[light_columns]

with VerboseTimer("Saving model training data"):
    # mode='w' (re)creates the file; format='table' with data_columns makes
    # the listed columns queryable on disk. `key` is passed by keyword because
    # newer pandas versions deprecate passing it positionally.
    light.to_hdf(
        data_location,
        key='light',
        mode='w',
        data_columns=['image_name', 'imaging_device', 'path'],
        format='table',
    )
    # Re-open the same file (HDFStore's default mode appends) and add the
    # two full frames next to the light table.
    with HDFStore(data_location) as store:
        store['data'] = train_df
        store['test'] = test_df

size = get_size(data_location)
logger.debug(f"training data's file size was: {size}")



[11:08:23][DEBUG] Saving the data
Saving model training data: 0:00:15.991652
[11:08:39][DEBUG] training data's file size was: 2.04 GB


In [20]:
# Announce where the processed data was written. The bare last expression
# makes the notebook echo the formatted path as the cell's output.
print("Data saved at:")
format(data_location)

Data saved at:


'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\Cognitive-LUIS-Windows-master\\Sample\\VQA.Python\\data\\model_input.h5'