In [1]:
import os
import pandas as pd
from pandas import HDFStore

In [2]:
# Destination of the generated meta data file (the word / answer lookup tables),
# resolved against the current working directory.
fn_meta = os.path.abspath(os.path.join('data', 'meta_data.json'))

In [3]:
from common.constatns import train_data, validation_data, raw_data_location

### Preprocessing and creating meta data

Load the data itself. Note that the only columns required in the dataframe are:
1. image_name
2. question
3. answer


In [4]:
# Read the full table stored under the 'data' key of the raw HDF file.
with HDFStore(raw_data_location) as hdf:
    raw_frame = hdf['data']

# Keep only the rows that belong to the train / validation groups.
in_scope = raw_frame['group'].isin(['train', 'validation'])
df_data = raw_frame[in_scope]

print(f'Data length: {len(df_data)}')
df_data.head()


Data length: 5913


Unnamed: 0,image_name,question,answer,group,path
0,rjv03401,what does mri show?,lesion at tail of pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
1,AIAN-14-313-g002,where does axial section mri abdomen show hypo...,in distal pancreas,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
2,wjem-11-76f3,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
3,ccr30002-0045-f3,what was normal?,blood supply to the brain,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...
4,rjt01904,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,train,C:\Users\Public\Documents\Data\2018\VQAMed2018...


We will use this function for creating meta data:

In [5]:
from vqa_logger import logger 
import itertools
import string
from common.os_utils import File #This is a simplehelper file of mine...

def create_meta(df):
    """Build the word and answer lookup tables for a question/answer dataframe.

    Indices are assigned over the *sorted* vocabulary / answers, so the result
    is reproducible between kernel runs (plain ``set`` iteration order depends
    on string hash randomisation).

    Args:
        df: table with string columns ``'question'`` and ``'answer'``.

    Returns:
        dict with three entries:

        * ``'ix_to_word'``: ``{word: index}`` over every punctuation-free word
          in the question and answer columns.  NOTE: despite its name this
          maps word -> index; the key and direction are kept as-is so existing
          consumers of the meta file keep working.
        * ``'ix_to_ans'``: ``{index: answer}`` over the distinct full answers.
        * ``'ans_to_ix'``: ``{answer: index}``, the inverse of ``'ix_to_ans'``.

    Raises:
        AssertionError: if the generated tables are internally inconsistent.
    """
    print(f"Dataframe had {len(df)} rows")

    # Translation table that deletes every ASCII punctuation character.
    punctuation_remover = str.maketrans('', '', string.punctuation)

    def get_unique_words(col):
        """Return the set of distinct punctuation-free words in column ``col``."""
        single_string = " ".join(df[col])
        no_punctuation = single_string.translate(punctuation_remover)
        # split() without arguments splits on any whitespace run (spaces, tabs,
        # newlines) and never yields empty strings.
        unique_words = set(no_punctuation.split())
        print("column {0} had {1} unique words".format(col, len(unique_words)))
        return unique_words

    cols = ['question', 'answer']
    # Sort before enumerating so that every run assigns the same indices.
    df_unique_words = sorted(set(itertools.chain.from_iterable(get_unique_words(col) for col in cols)))
    df_unique_answers = sorted(set(df['answer']))

    metadata = {}
    metadata['ix_to_word'] = {str(word): int(i) for i, word in enumerate(df_unique_words)}
    metadata['ix_to_ans'] = {i: ans for i, ans in enumerate(df_unique_answers)}
    metadata['ans_to_ix'] = {ans: i for i, ans in enumerate(df_unique_answers)}

    #------------------- Asserts
    ix_to_ans = metadata['ix_to_ans']
    ans_to_ix = metadata['ans_to_ix']
    word_indices = metadata['ix_to_word'].values()

    # Both answer tables must describe the same bijection.
    assert len(ix_to_ans) == len(ans_to_ix) == len(df_unique_answers), 'Got duplicate answers'
    assert all(ans_to_ix[ans] == i for i, ans in ix_to_ans.items()), 'Got duplicate answers'
    # Every word must own exactly one index out of 0..n-1.
    assert sorted(word_indices) == list(range(len(df_unique_words))), 'Got duplicate words'

    print("Meta number of unique answers: {0}".format(len(set(ix_to_ans.values()))))
    print("Meta number of unique words: {0}".format(len(set(word_indices))))

    return metadata

In [6]:
print("----- Creating meta -----")
# Build the lookup tables from the train / validation rows held in df_data.
meta_data = create_meta(df_data)

# Show which lookup tables were produced.
meta_data.keys()

----- Creating meta -----
Dataframe had 5913 rows
column question had 3374 unique words
column answer had 3360 unique words
Meta number of unique answers: 4906
Meta number of unique words: 3727


dict_keys(['ix_to_word', 'ix_to_ans', 'ans_to_ix'])

In [7]:
# Write the lookup tables to fn_meta through the project's File helper.
# NOTE(review): if dump_json serialises with the standard json module, the
# integer keys of 'ix_to_ans' will come back as strings on load - confirm.
File.dump_json(meta_data,fn_meta)
print(f"Meta file available at: {fn_meta}")

Meta file available at: C:\Users\avitu\Documents\GitHub\VQA-MED\VQA-MED\Cognitive-LUIS-Windows-master\Sample\VQA.Python\data\meta_data.json
