In [1]:
import os
import pandas as pd
from pandas import HDFStore
from nltk.corpus import stopwords
import IPython

In [2]:
from common.functions import get_highlighted_function_code
from common.settings import embedding_dim, seq_length, data_access
# from common.classes import VqaSpecs
from common.utils import VerboseTimer
import vqa_logger 
from common.os_utils import File
from pre_processing.meta_data import create_meta
# Show full cell contents when DataFrames are displayed. `None` is the
# supported way to say "no width limit"; the legacy value -1 has been
# deprecated since pandas 1.0 and is no longer accepted by newer releases.
pd.set_option('display.max_colwidth', None)

### Preprocessing and creating meta data

Get the data itself. Note that the only things required in the dataframe are:
1. image_name
2. processed question
3. processed answer


In [9]:
# Load only the columns requested below, then keep the rows whose `group`
# is one of the three dataset splits.
df_data = data_access.load_processed_data(
    columns=[
        'path',
        'question',
        'answer',
        'processed_question',
        'processed_answer',
        'group',
        'question_category',
    ]
)
in_known_split = df_data['group'].isin(['train', 'validation', 'test'])
df_data = df_data[in_known_split]
print(f'Data length: {len(df_data)}')

[2019-03-26 00:31:30][data_access.api][DEBUG] loading processed data from:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.parquet
Data length: 15292


In [10]:
# Preview 7 randomly chosen rows (no random_state is set, so the rows shown
# differ from run to run).
df_data.sample(n=7)

Unnamed: 0,path,question,answer,processed_question,processed_answer,question_category,group
8678,C:\Users\Public\Documents\Data\2019\train\Train_images\synpic26594.jpg,which organ system is imaged?,genitourinary,organ system imaged?,genitourinary,Organ,train
12689,C:\Users\Public\Documents\Data\2019\train\Train_images\synpic53326.jpg,what is the primary abnormality in this image?,hodgkin's lymphoma,primary abnormality image?,hodgkin's lymphoma,Abnormality,train
9845,C:\Users\Public\Documents\Data\2019\train\Train_images\synpic52810.jpg,what part of the body is being imaged here?,skull and contents,part body imaged here?,skull contents,Organ,train
10933,C:\Users\Public\Documents\Data\2019\train\Train_images\synpic49090.jpg,what is abnormal in the ct scan?,"diffuse idiopathic skeletal hyperostosis (dish), cervical and thoracic spine fractures",abnormal ct scan?,"diffuse idiopathic skeletal hyperostosis (dish), cervical thoracic spine fractures",Abnormality,train
8156,C:\Users\Public\Documents\Data\2019\train\Train_images\synpic34953.jpg,what organ systems can be evaluated with this mri?,skull and contents,organ systems evaluated mri?,skull contents,Organ,train
12356,C:\Users\Public\Documents\Data\2019\train\Train_images\synpic55306.jpg,what is abnormal in the ct scan?,cirrhosis,abnormal ct scan?,cirrhosis,Abnormality,train
216,C:\Users\Public\Documents\Data\2019\test\VQAMed2019_Test_Images\synpic60336.jpg,what plane is this ct scan in?,,plane ct scan in?,,Plane,test


#### We will use this function for creating meta data:

In [4]:
# Fetch the code of create_meta (helper called with remove_comments=False)
# and render it in the notebook output.
code = get_highlighted_function_code(
    create_meta,
    remove_comments=False,
)
IPython.display.display(code)

In [5]:
# Announce the step, then build the meta data from the filtered frame.
print('----- Creating meta -----')
meta_data_dict = create_meta(df_data)

----- Creating meta -----
[2019-03-26 00:30:48][pre_processing.meta_data][DEBUG] Data frame had 15292 rows
[2019-03-26 00:30:48][pre_processing.meta_data][DEBUG] column processed_question had 71 unique words
[2019-03-26 00:30:48][pre_processing.meta_data][DEBUG] column processed_answer had 2124 unique words


#### Saving the data, so later on we don't need to compute it again

In [6]:
# Announce the step, then hand the meta data to data_access for saving.
print('----- Saving meta -----')
data_access.save_meta(meta_data_dict)

----- Saving meta -----
[2019-03-26 00:30:48][data_access.api][DEBUG] Meta number of unique answers: 1707
[2019-03-26 00:30:48][data_access.api][DEBUG] Meta number of unique words: 2120


##### Test Loading:

In [7]:
# Load the meta data back through data_access and pull out its two tables.
loaded_meta = data_access.load_meta()
answers_meta = loaded_meta['answers']
words_meta = loaded_meta['words']

# A notebook cell only renders its last expression automatically, so each
# summary is passed to display() explicitly; previously the describe() result
# was computed and silently discarded.
IPython.display.display(answers_meta.question_category.describe())
IPython.display.display(answers_meta.sample(5))

Unnamed: 0,answer,question_category
12312,epidermoid cyst testis,Abnormality
10118,hydranencephaly,Abnormality
12023,intramuscular hemangioma,Abnormality
3725,pa,Plane
10381,testicular microlithiasis,Abnormality
