In [1]:
import os
import pandas as pd
from pandas import HDFStore

In [2]:
# Absolute path of the JSON file that will hold the generated meta data
fn_meta = os.path.abspath(os.path.join('data', 'meta_data.json'))

In [3]:
from common.constatns import data_location, vqa_specs_location
from common.settings import embedding_dim, seq_length
from common.classes import VqaSpecs

### Preprocessing and creating meta data

Load the data itself. Note that the only columns required in the dataframe are:
1. image_name
2. question
3. answer


In [4]:
print(f'loading from:\n{data_location}')
# Open read-only: HDFStore defaults to mode='a' (append), which silently creates
# an empty file when data_location does not exist and needs write access to it.
with HDFStore(data_location, mode='r') as store:
    df_data = store['data']

# Only the train / validation rows are used from here on
df_data = df_data[df_data.group.isin(['train','validation'])]
print(f'Data length: {len(df_data)}')
df_data.head()


loading from:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\model_input.h5
Data length: 5913


Unnamed: 0,image_name,question,answer,group,path,tumor,hematoma,brain,abdomen,neck,liver,imaging_device,answer_embedding,question_embedding,image
0,rjv03401.jpg,what does MRI show?,tumor at tail pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,True,False,False,False,False,False,mri,"[[3.8335671424865723, 0.9851416349411011, 0.60...","[[-2.1287951469421387, 2.4069643020629883, 0.9...","[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
1,AIAN-14-313-g002.jpg,where does axial seCTion MRI abdomen show hypo...,distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,True,False,False,mri,"[[0.9880439043045044, 0.907943844795227, -1.30...","[[0.329662561416626, 1.4127026796340942, -3.38...","[[[9, 9, 9], [9, 9, 9], [10, 10, 10], [9, 9, 9..."
2,wjem-11-76f3.jpg,what do arrows denote noncontrast CT pelvis?,complex fluid colleCTion with layerg consisten...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,True,False,False,False,False,ct,"[[0.03861135244369507, -1.7372519969940186, -1...","[[-1.4648534059524536, 3.1732239723205566, 2.0...","[[[45, 45, 45], [23, 23, 23], [23, 23, 23], [2..."
3,ccr30002-0045-f3.jpg,what was normal?,blood supply to bra,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,False,False,False,ct,"[[1.421677589416504, 1.1374449729919434, 0.465...","[[-2.699403762817383, 1.9192107915878296, 0.21...","[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
4,rjt01904.jpg,what shows evidence a contaed rupture?,repeat CT abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...,False,False,False,True,False,False,ct,"[[1.9254751205444336, 1.511404275894165, 0.462...","[[-2.2646989822387695, 2.072265148162842, 0.51...","[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."


In [5]:
import numpy as np
# Compare the imaging devices present in the full data with those left
# after keeping only the 'ct' / 'mri' rows.
d = df_data.loc[df_data['imaging_device'].isin(['ct', 'mri'])]
print(np.unique(df_data['imaging_device']))
print(np.unique(d['imaging_device']))

['ct' 'mri' 'unknown']
['ct' 'mri']


#### We will use this function for creating meta data:

In [6]:
from vqa_logger import logger 
import itertools
import string
from common.os_utils import File #This is a simplehelper file of mine...

def create_meta(df):
    """Build the vocabulary / label lookup tables for a VQA dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``question``, ``answer`` and ``imaging_device``.

    Returns
    -------
    dict
        ``ix_to_word``       -- word -> index over the words of both text columns.
                                NOTE: despite its name this maps *words to indices*;
                                the key is kept unchanged so existing consumers of the
                                meta data keep working.
        ``ix_to_ans``        -- index -> full answer string
        ``ans_to_ix``        -- full answer string -> index (inverse of ``ix_to_ans``)
        ``img_device_to_ix`` -- imaging device -> index
        ``ix_to_img_device`` -- index -> imaging device

    Indices are assigned in sorted order, so the same data always yields the same
    meta data (enumerating a raw ``set`` of strings gives a different order from
    one interpreter run to the next because of string hash randomisation).
    """
    print(f"Dataframe had {len(df)} rows")

    punctuation = set(string.punctuation)

    def get_unique_words(col):
        # Skip missing values: str.join raises TypeError on a NaN / None entry.
        texts = df[col].dropna().astype(str)
        single_string = " ".join(texts)
        s_no_punctuation = ''.join(ch for ch in single_string if ch not in punctuation)
        # split() without an argument splits on any whitespace run (tabs and
        # newlines included) and never yields empty strings.
        unique_words = set(s_no_punctuation.split())
        print("column {0} had {1} unique words".format(col, len(unique_words)))
        return unique_words

    cols = ['question', 'answer']
    df_unique_words = set(itertools.chain.from_iterable(get_unique_words(col) for col in cols))

    # Sort before enumerating so that every index is reproducible across runs.
    # key=str keeps the sort well-defined even if a column holds non-string values.
    sorted_words = sorted(df_unique_words)
    sorted_answers = sorted(set(df['answer']), key=str)
    sorted_imaging_devices = sorted(set(df['imaging_device']), key=str)

    metadata = {}
    metadata['ix_to_word'] = {str(word): int(i) for i, word in enumerate(sorted_words)}
    metadata['ix_to_ans'] = {i: ans for i, ans in enumerate(sorted_answers)}
    metadata['ans_to_ix'] = {ans: i for i, ans in enumerate(sorted_answers)}

    metadata['img_device_to_ix'] = {dev: i for i, dev in enumerate(sorted_imaging_devices)}
    metadata['ix_to_img_device'] = {i: dev for i, dev in enumerate(sorted_imaging_devices)}

    #------------------- Asserts
    answers = metadata['ix_to_ans'].values()
    words = metadata['ix_to_word'].values()

    assert len(set(answers)) == len(answers), 'Got duplicate answers'
    assert len(set(words)) == len(words), 'Got duplicate words'
    #---------------------------

    print("Meta number of unique answers: {0}".format(len(set(metadata['ix_to_ans'].values()))))
    print("Meta number of unique words: {0}".format(len(set(metadata['ix_to_word'].values()))))

    return metadata

In [7]:
print("----- Creating meta -----")
meta_data = create_meta(df_data)

# Show which lookup tables the meta data contains
meta_data.keys()

----- Creating meta -----
Dataframe had 5913 rows
column question had 3346 unique words
column answer had 3333 unique words
Meta number of unique answers: 4753
Meta number of unique words: 3694


dict_keys(['ix_to_word', 'ix_to_ans', 'ans_to_ix', 'img_device_to_ix', 'ix_to_img_device'])

In [8]:
# Write the meta data to fn_meta so later steps can reload it from disk.
# NOTE(review): File.dump_json is a project helper; if it wraps the standard json
# module, the integer keys of 'ix_to_ans' / 'ix_to_img_device' are written as
# strings -- confirm that readers convert them back.
File.dump_json(meta_data,fn_meta)
print(f"Meta file available at: {fn_meta}")

Meta file available at: C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\meta_data.json


#### Saving the data, so later on we don't need to compute it again

In [9]:
def get_vqa_specs(meta_data):
    """Create the VqaSpecs for this run.

    Combines the imported ``embedding_dim``, ``seq_length`` and ``data_location``
    values with the given ``meta_data`` and returns the resulting ``VqaSpecs``.
    """
    return VqaSpecs(
        embedding_dim=embedding_dim,
        seq_length=seq_length,
        data_location=data_location,
        meta_data=meta_data,
    )

vqa_specs = get_vqa_specs(meta_data)

# Preview the specs: show the string form only up to and including 'meta_data=',
# leaving out the meta data content that follows it.
s = str(vqa_specs)
s[:s.index('meta_data=') + len('meta_data=')]

"VqaSpecs(embedding_dim=384, seq_length=26, data_location='C:\\\\Users\\\\avitu\\\\Documents\\\\GitHub\\\\VQA-MED\\\\VQA-MED\\\\VQA.Python\\\\data\\\\model_input.h5', meta_data="

In [10]:
# Store the specs object at vqa_specs_location and log the destination.
# File.dump_pickle is a project helper -- presumably it pickles the object; verify.
File.dump_pickle(vqa_specs, vqa_specs_location)
logger.debug(f"VQA Specs saved to:\n{vqa_specs_location}")

[14:19:58][DEBUG] VQA Specs saved to:
C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\VQA.Python\data\vqa_specs.pkl


In [11]:
# Print the location as a Python assignment with every backslash doubled,
# i.e. in the form of a string literal that can be pasted into source code.
print (f"vqa_specs_location = '{vqa_specs_location}'".replace('\\','\\\\'))

vqa_specs_location = 'C:\\Users\\avitu\\Documents\\GitHub\\VQA-MED\\VQA-MED\\VQA.Python\\data\\vqa_specs.pkl'
