In [1]:
import os
import pandas as pd
from pandas import HDFStore
from nltk.corpus import stopwords
import IPython

In [2]:
from common.functions import get_highlighted_function_code
from common.constatns import data_location, vqa_specs_location, fn_meta
from common.settings import embedding_dim, seq_length
from common.classes import VqaSpecs
from common.utils import VerboseTimer
from common.os_utils import File
from pre_processing.meta_data import create_meta

### Preprocessing and creating meta data

Get the data itself. Note that the only columns required in the dataframe are:
1. image_name
2. question
3. answer


In [3]:
print(f'loading from:\n{data_location}')
with VerboseTimer("Loading Data"), HDFStore(data_location) as store:
    df_data = store['data']

# Keep only the rows whose group is 'train' or 'validation'
df_data = df_data.loc[df_data['group'].isin(['train', 'validation'])]
print(f'Data length: {len(df_data)}')
df_data.head(2)


loading from:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5
Data length: 5913


Unnamed: 0,index,image_name,question,answer,group,path,original_question,original_answer,tumor,hematoma,brain,abdomen,neck,liver,imaging_device,answer_embedding,question_embedding,is_imaging_device_question
0,0,rjv03401.jpg,what does mri show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,what does mri show?,lesion at tail of pancreas,True,False,False,False,False,False,mri,"[[3.8335671424865723, 0.9851416349411011, 0.60...","[[-1.8407480716705322, 2.5507988929748535, 0.7...",1
1,1,AIAN-14-313-g002.jpg,where does axial section mri abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,where does axial section mri abdomen show hypo...,in distal pancreas,False,False,False,True,False,False,mri,"[[0.9880439043045044, 0.907943844795227, -1.30...","[[0.35850387811660767, 1.4076576232910156, -3....",0


In [4]:
import numpy as np

# Select the 'ct' / 'mri' rows and print the distinct imaging devices
# before and after that selection.
d = df_data.loc[df_data['imaging_device'].isin(['ct', 'mri'])]
print(np.unique(df_data['imaging_device']))
print(np.unique(d['imaging_device']))

['ct' 'mra' 'mri' 'unknown']
['ct' 'mri']


#### We will use this function for creating meta data:

In [5]:
# Fetch the highlighted source of create_meta (comments kept) and display it.
code = get_highlighted_function_code(create_meta, remove_comments=False)
IPython.display.display(code)

In [6]:
print("----- Creating meta -----")
meta_data = create_meta(df_data, fn_meta)

# Read the 'words', 'answers' and 'imaging_devices' tables from the meta data store.
with HDFStore(fn_meta) as metadata_store:
    df_words, df_answers, df_imaging_device = (
        metadata_store[key] for key in ('words', 'answers', 'imaging_devices')
    )

df_words.head()


----- Creating meta -----


Unnamed: 0,word
0,ct
1,abd
2,acl
3,aga
4,age


#### Saving the data, so later on we don't need to compute it again

In [7]:
def get_vqa_specs(meta_location):
    """
    Build the VqaSpecs describing this run.

    The embedding dimension and sequence length are taken from the imported
    `embedding_dim` / `seq_length` settings, and the data location from the
    imported `data_location`; both locations are stored as absolute paths.

    :param meta_location: path of the meta data file
    :return: a VqaSpecs instance
    """
    return VqaSpecs(
        embedding_dim=embedding_dim,
        seq_length=seq_length,
        data_location=os.path.abspath(data_location),
        meta_data_location=os.path.abspath(meta_location),
    )

vqa_specs = get_vqa_specs(meta_location=fn_meta)

# Show what we got...
vqa_specs


VqaSpecs(embedding_dim=384, seq_length=26, data_location=C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5, meta_data_location=C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\meta_data.h5, prediction_df_name=words)

In [8]:
# Pickle the specs to vqa_specs_location and report where they were written.
File.dump_pickle(vqa_specs, vqa_specs_location)
print(f"VQA Specs saved to:\n{vqa_specs_location}")

VQA Specs saved to:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\vqa_specs.pkl


##### Test Loading:

In [9]:
# Load the pickle written to vqa_specs_location and show the result.
loaded_vqa_specs = File.load_pickle(vqa_specs_location)
loaded_vqa_specs

VqaSpecs(embedding_dim=384, seq_length=26, data_location=C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5, meta_data_location=C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\meta_data.h5, prediction_df_name=words)

In [10]:
# Print an assignment-style line for vqa_specs_location with every backslash doubled.
print(f"vqa_specs_location = '{vqa_specs_location}'".replace('\\', '\\\\'))

vqa_specs_location = 'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\vqa_specs.pkl'
