# Build Dataframe

Load the dataset from the BOLD5000 files and perform additional preprocessing.

In [2]:
import h5py
import numpy as np
import pandas as pd
from nltk.corpus import wordnet as wn
import re

## Load HDF5 format file

In [2]:
# Open the CSI1 ROI file in read-only mode. Later cells read its datasets through this handle.
# NOTE(review): the handle is never closed in the cells shown here.
csi1 = h5py.File('bold5000/ROIs/CSI1/h5/CSI1_ROIs_TR34.h5', 'r')

## Convert HDF5 dictionary into Pandas dataframe

In [3]:
# Build one dataframe column per top-level key of the HDF5 file. Each cell of a
# column holds one item produced by iterating over that key's dataset.
df = pd.DataFrame()
for roi_name, roi_data in csi1.items():
    df[roi_name] = list(roi_data)

## Get mean of each ROI

In [4]:
# Add a '<ROI>_mean' column for every ROI column that was loaded from the HDF5 file.
# The ROI names are taken from csi1 instead of from df itself, so re-running this
# cell (or running it after non-ROI columns such as 'image' have been added)
# recomputes the same columns rather than creating '<ROI>_mean_mean' columns or
# applying np.mean to non-ROI columns.
for col in list(csi1.keys()):
    df[col + '_mean'] = df[col].apply(np.mean)

## Add image names for each trial

In [5]:
# Read the stimulus list (the file has no header row) and store it as the 'image' column.
# NOTE(review): this assumes the file parses to a single column with one row per
# trial, in the same order as the rows of df — confirm.
df['image'] = pd.read_csv('bold5000/ROIs/stim_lists/CSI01_stim_lists.txt', header=None)

## Identify dataset from image name

In [6]:
def get_dataset(image_name):
    """Return the name of the source dataset implied by an image file name.

    Args:
        image_name: Image file name as a string.

    Returns:
        'ImageNet' if the name starts with 'n' followed by at least one digit,
        'COCO' if it starts with 'COCO', and 'Scene' for every other name.
    """
    # Patterns are checked in order; the first one that matches wins.
    prefix_patterns = (
        (r'^n[0-9]+', 'ImageNet'),
        (r'^COCO', 'COCO'),
    )
    for pattern, dataset_name in prefix_patterns:
        if re.match(pattern, image_name):
            return dataset_name
    return 'Scene'

In [7]:
# Label every trial with the dataset that its image name points to.
df['dataset'] = df['image'].map(get_dataset)

In [8]:
# Display the assembled dataframe (ROI columns, their means, image name and dataset label).
df

Unnamed: 0,LHEarlyVis,LHLOC,LHOPA,LHPPA,LHRSC,RHEarlyVis,RHLOC,RHOPA,RHPPA,RHRSC,...,LHOPA_mean,LHPPA_mean,LHRSC_mean,RHEarlyVis_mean,RHLOC_mean,RHOPA_mean,RHPPA_mean,RHRSC_mean,image,dataset
0,"[-0.0007332996202192993, 0.012742927021082125,...","[-0.010073865387316364, -0.016242933708728004,...","[-0.0007190054046507183, 0.015712804006128397,...","[0.009640645082200293, 0.00620421546485009, 0....","[0.00022896090203213546, -0.000101091228016807...","[-0.017438124464051824, -0.016415346568266247,...","[0.017152408363833232, 0.012395771684620905, -...","[0.001060014012804079, 0.0037969468624613123, ...","[-0.017432118504920477, -0.011206780725019462,...","[0.010523216881553926, 0.018616894109178304, -...",...,0.003895,0.006683,0.004827,0.002717,0.001780,0.001302,0.005665,0.001821,n01930112_19568.JPEG,ImageNet
1,"[-0.00408365372866439, 0.024045989311775824, 0...","[-0.006417351988315173, -0.014367437005644718,...","[0.007962392055901293, 0.012567941117704533, -...","[0.009565928987776246, 0.008896546161765401, 0...","[-0.009156423434542763, 0.05101432852634284, -...","[0.023224237413044226, 0.0353701650767394, 0.0...","[-0.02891731908691263, 0.0021785745731643016, ...","[0.002489846229923792, 0.0018723380324673074, ...","[0.007377422572302858, 0.008999538046607606, 0...","[0.00757396440199707, 0.015204065415015091, -0...",...,0.007286,0.008123,-0.002906,0.008042,0.003517,0.008053,0.008152,0.004855,n03733281_29214.JPEG,ImageNet
2,"[-0.018311249668538348, -0.004089993007096237,...","[-0.017487259608276205, -0.004371767173131182,...","[0.0005859454783991519, -0.0022891711986499936...","[0.02933940348470025, -0.012800744352464786, -...","[-0.008140340415565245, -0.04236308780771805, ...","[0.01316850981965961, 0.0014648381254350525, -...","[-0.001998435218477617, 0.0069175267470489386,...","[0.014475440262399785, -0.004527195106635453, ...","[-0.0012363335041166205, 0.010682580183073767,...","[-0.024525205155412268, 0.011177942608016102, ...",...,-0.000814,-0.001542,-0.001116,-0.001614,0.001143,-0.002343,-0.003894,-0.005385,n07695742_5848.JPEG,ImageNet
3,"[-0.0036358480255496263, -0.005777085248153142...","[-0.0014113604454632817, 0.020365468531680607,...","[0.012450292006283163, 0.003063120950630369, -...","[0.016324929197585283, 0.03552697615129019, -0...","[0.004641235352692994, -0.014883833619833267, ...","[-0.027106648625108892, -0.03938701425992952, ...","[0.00798473039401235, 0.006827167514875424, -0...","[0.003354165450599697, 0.002991356168815449, 0...","[0.00015382638565329156, 0.005530484400725495,...","[0.009531465240619173, -0.019319234736257816, ...",...,0.003601,0.005148,0.001336,0.005857,0.001512,0.000648,0.004600,-0.001624,COCO_train2014_000000420713.jpg,COCO
4,"[-0.01030870139350444, -0.016064138259020894, ...","[0.01493559236266165, 0.024060802111018253, 0....","[0.005391067553988603, 0.018118825217784254, 0...","[0.004919489346516212, 0.01058411592366265, 0....","[0.004059522917029128, -0.01289937137776036, 0...","[-0.008160318253698132, -0.005926557660344035,...","[0.003562135896404079, 0.0019933039379347126, ...","[0.010437897433639626, 0.0005288043700255862, ...","[-0.011886311204651664, -0.000482743264098604,...","[0.02548020488763388, 0.010991492131499962, 0....",...,0.006210,0.007275,-0.000906,0.003203,0.004495,0.004283,0.009814,0.004179,COCO_train2014_000000488558.jpg,COCO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5249,"[-0.0009421108411824068, 0.029326541306720053,...","[0.008225730774303013, 0.022443894192539258, 0...","[0.00952613040262936, 0.01535845226382419, 0.0...","[0.006028028679407688, 0.018251020328361615, 0...","[-0.003681877318790481, 0.010612401616277722, ...","[0.009987075216431765, -0.008782943378162274, ...","[0.0024730871155477225, 0.004089197138073718, ...","[0.004663480360500883, 0.007910011620421929, 0...","[-0.001692269373818449, 0.018729292777278313, ...","[0.03420494553750923, -0.009056362591876733, 0...",...,0.008997,0.008606,0.000404,0.011612,0.008266,0.005739,0.010024,0.001500,n03459775_11670.JPEG,ImageNet
5250,"[-0.012078784012960562, -0.02167483808256212, ...","[0.017891098429848193, 0.03322446768832953, -0...","[-0.0019893238813475976, 0.013669359120742835,...","[0.008940782467610554, 0.015124860728704951, 0...","[0.009869190203649778, 0.01680426984422704, 0....","[0.01646279895920465, 0.011708069513139815, 0....","[0.011023134264458374, -0.011279938206026182, ...","[0.009597189636926734, 0.013078594152950972, 0...","[0.025947569767387525, 0.010975654867218708, -...","[-0.004944135186845226, -0.001497345889960939,...",...,0.006987,0.007112,0.004685,0.007980,0.003387,0.008734,0.010876,0.010040,COCO_train2014_000000523236.jpg,COCO
5251,"[-0.008526761330834843, 0.01818614527015821, 0...","[0.03196713277782655, 0.032462068545924976, 0....","[5.766120257791373e-05, 0.009985042729412547, ...","[0.011807984373427725, -0.007408494477070161, ...","[0.002685608772018225, -0.009789752272959345, ...","[0.022452504350459777, 0.02935295038269948, -0...","[-0.0157243476139395, -0.002804142968302295, -...","[0.007481951934809708, -0.0037096256783461807,...","[-0.021392125117081093, -0.012146419819503587,...","[0.0060688730449791495, -0.0058433655634803905...",...,0.004916,0.001651,-0.004846,0.010173,0.009859,0.007659,0.006047,-0.002275,dinosaur4.jpg,Scene
5252,"[0.014976786776645304, 0.004534045271405831, -...","[0.0024258546240347135, 0.000580538431876615, ...","[0.0038296729873518462, 0.009641090531299405, ...","[0.022157192600674853, 0.010643246800740191, 0...","[0.013571905101952844, 0.04397408765866176, 0....","[0.009079294754454192, -0.016052720703118337, ...","[0.006404753842573196, 0.002827573079790548, 0...","[0.004190081004047361, -0.001178182689673088, ...","[0.026032467801431976, 0.011348057289253047, 0...","[-0.000977379690519824, -0.00595607969355465, ...",...,0.007631,0.012284,0.005515,0.007794,0.003719,0.009141,0.008327,0.009764,COCO_train2014_000000244132.jpg,COCO


## Save to file

In [9]:
# Write the dataframe to a pickle file; the path is relative to the working directory.
df.to_pickle('CSI1_ROIs_TR34.pkl')

## Add WordNet information for ImageNet images

In [10]:
# Work on an independent copy of the ImageNet trials, so columns added to it do not touch df.
imagenet_df = df.loc[df['dataset'] == 'ImageNet'].copy()

## Process Synsets

The synset information is stored in the dataframe as strings so that downstream code does not need to depend on nltk or the WordNet corpus.

In [11]:
# Helper functions
def get_synset(image_name):
    """Return the WordNet synset name for an ImageNet image file name.

    The digits that follow the leading 'n' of the file name are used as the
    offset of a noun synset in WordNet.

    Args:
        image_name: File name such as 'n01930112_19568.JPEG'.

    Returns:
        The synset name as a string, e.g. 'nematode.n.01'.

    Raises:
        ValueError: If image_name does not start with 'n' followed by digits.
    """
    match = re.search(r'(?<=^n)[0-9]+', image_name)
    if match is None:
        # Without this guard the failure would be an opaque AttributeError on None.
        raise ValueError('not an ImageNet image name: {!r}'.format(image_name))
    synset = wn.synset_from_pos_and_offset('n', int(match.group(0)))
    return synset.name()

def get_hypernym_paths(synset):
    """Return the hypernym paths of a synset as lists of synset names.

    Args:
        synset: Synset name string that wn.synset can look up (e.g. 'maze.n.01').

    Returns:
        A list with one entry per path returned by hypernym_paths(); each
        entry is the list of names of the synsets on that path, in the order
        WordNet returns them.
    """
    # Names are stored instead of Synset objects so the result is plain strings.
    return [
        [node.name() for node in hypernym_path]
        for hypernym_path in wn.synset(synset).hypernym_paths()
    ]

def get_ranks(paths):
    """Group the entries of several paths by their position in the path.

    Args:
        paths: Iterable of lists, e.g. the hypernym paths of one synset.

    Returns:
        A list of sets. The set at index i holds the i-th entry of every path
        that has at least i + 1 entries, so the list is as long as the longest
        path. The input lists are not modified.
    """
    paths = list(paths)
    longest = max((len(path) for path in paths), default=0)
    return [
        {path[depth] for path in paths if depth < len(path)}
        for depth in range(longest)
    ]

In [12]:
# Resolve each ImageNet file name to its WordNet synset name.
imagenet_df['synset'] = imagenet_df['image'].map(get_synset)

In [13]:
# Store every hypernym path of each synset as lists of synset-name strings.
imagenet_df['hypernym_paths'] = imagenet_df['synset'].map(get_hypernym_paths)

In [14]:
# Collapse the hypernym paths of each row into one set of synset names per path position.
imagenet_df['ranks'] = imagenet_df['hypernym_paths'].map(get_ranks)

In [15]:
# Display the ImageNet subset with the added synset, hypernym_paths and ranks columns.
imagenet_df

Unnamed: 0,LHEarlyVis,LHLOC,LHOPA,LHPPA,LHRSC,RHEarlyVis,RHLOC,RHOPA,RHPPA,RHRSC,...,RHEarlyVis_mean,RHLOC_mean,RHOPA_mean,RHPPA_mean,RHRSC_mean,image,dataset,synset,hypernym_paths,ranks
0,"[-0.0007332996202192993, 0.012742927021082125,...","[-0.010073865387316364, -0.016242933708728004,...","[-0.0007190054046507183, 0.015712804006128397,...","[0.009640645082200293, 0.00620421546485009, 0....","[0.00022896090203213546, -0.000101091228016807...","[-0.017438124464051824, -0.016415346568266247,...","[0.017152408363833232, 0.012395771684620905, -...","[0.001060014012804079, 0.0037969468624613123, ...","[-0.017432118504920477, -0.011206780725019462,...","[0.010523216881553926, 0.018616894109178304, -...",...,0.002717,0.001780,0.001302,0.005665,0.001821,n01930112_19568.JPEG,ImageNet,nematode.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
1,"[-0.00408365372866439, 0.024045989311775824, 0...","[-0.006417351988315173, -0.014367437005644718,...","[0.007962392055901293, 0.012567941117704533, -...","[0.009565928987776246, 0.008896546161765401, 0...","[-0.009156423434542763, 0.05101432852634284, -...","[0.023224237413044226, 0.0353701650767394, 0.0...","[-0.02891731908691263, 0.0021785745731643016, ...","[0.002489846229923792, 0.0018723380324673074, ...","[0.007377422572302858, 0.008999538046607606, 0...","[0.00757396440199707, 0.015204065415015091, -0...",...,0.008042,0.003517,0.008053,0.008152,0.004855,n03733281_29214.JPEG,ImageNet,maze.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
2,"[-0.018311249668538348, -0.004089993007096237,...","[-0.017487259608276205, -0.004371767173131182,...","[0.0005859454783991519, -0.0022891711986499936...","[0.02933940348470025, -0.012800744352464786, -...","[-0.008140340415565245, -0.04236308780771805, ...","[0.01316850981965961, 0.0014648381254350525, -...","[-0.001998435218477617, 0.0069175267470489386,...","[0.014475440262399785, -0.004527195106635453, ...","[-0.0012363335041166205, 0.010682580183073767,...","[-0.024525205155412268, 0.011177942608016102, ...",...,-0.001614,0.001143,-0.002343,-0.003894,-0.005385,n07695742_5848.JPEG,ImageNet,pretzel.n.01,"[[entity.n.01, physical_entity.n.01, matter.n....","[{entity.n.01}, {physical_entity.n.01}, {matte..."
5,"[0.008006327274155324, 0.022844701914134347, -...","[0.0005973942654227474, 0.009223013633000406, ...","[0.012418947387206341, 0.003601473854389809, 0...","[0.014288682624057835, -0.002870163645700058, ...","[0.003280111975153305, 0.03159715520041075, 0....","[0.006774101589345393, -0.022699540338286535, ...","[-0.00385907414856795, 0.00226222241341696, 0....","[-0.008522479762203997, 0.012201783962186722, ...","[0.0049679719403497346, 0.0003725234939603063,...","[0.006239764604303685, 0.0214315971739864, 0.0...",...,0.010283,0.007499,0.006386,0.007514,0.000834,n01917289_1429.JPEG,ImageNet,brain_coral.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
6,"[-0.003332402318429617, -0.0057119668511583874...","[-0.023243747891978882, -0.019669040992302107,...","[0.001215857752906173, 0.014544436111422187, 0...","[-0.010715245407507075, -0.0031143930570632695...","[-0.009493682265910859, -0.038222775920328876,...","[0.02133088216843404, 0.013532624752024488, -0...","[-0.005588667915336392, 0.00412134803749096, -...","[0.014580038118250684, 0.002425172980774131, 0...","[0.010722914097136549, 0.018312850460383414, 0...","[0.014684486416297438, -0.01877878929864164, 0...",...,0.000180,0.003841,-0.000011,0.003147,-0.003566,n02108551_26574.JPEG,ImageNet,tibetan_mastiff.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5243,"[-0.005221619243131347, 0.006236469980865969, ...","[0.02398725076421252, 0.03449611608962513, 0.0...","[-0.0007292933812541807, 0.01598981283906226, ...","[0.0029805676729311456, -0.011216690541489162,...","[0.0033276283326679097, 0.020661913126261363, ...","[-0.010686462401085036, 0.010641627125319277, ...","[-0.009611596213230506, -0.01278596869672243, ...","[-0.004874383745968915, 0.01051393217830262, 0...","[0.0021544770042654735, -0.0018456851426815655...","[-0.00741887390636649, 0.0015939662210172206, ...",...,0.001231,-0.000182,0.007515,0.009306,0.006203,n09421951_5433.JPEG,ImageNet,sandbar.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
5245,"[-0.014397490973256155, 0.005007246801481307, ...","[0.001949767801690881, -0.009336854434384837, ...","[-0.005974795097015942, 0.008286712187628824, ...","[0.005076150260192791, 0.015083498439003775, 0...","[-0.012285786025503268, 0.011055061291882091, ...","[0.009907177032926047, 0.0007473426104999811, ...","[0.002451592935202595, 0.0041825216464569165, ...","[0.0005781563417227699, 0.006029890923498173, ...","[-0.007636358454230615, -0.00791666901577241, ...","[0.006481771524644495, 0.016555514343669447, -...",...,0.003925,0.008507,0.000446,0.002833,-0.009806,n02018207_3567.JPEG,ImageNet,american_coot.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
5247,"[0.012168834435180043, 0.02916689090498855, 0....","[0.018688031005249645, 0.016046747532684855, 0...","[-0.00018501655426165272, 0.013896470837598148...","[0.017529429404220875, 0.006093711965441748, 0...","[0.0032710842421554776, 0.04082133833435561, 0...","[0.00864157322619926, 0.04309468715219355, 0.0...","[-0.005500925676207026, 0.0063158955653292315,...","[-0.006224083855449661, -0.011954001314893214,...","[-0.023262868313581368, -0.029146061686577303,...","[-0.010737078061088397, -0.002406564400707933,...",...,0.003778,0.009884,0.000222,-0.000208,-0.002361,n01692333_12353.JPEG,ImageNet,gila_monster.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."
5248,"[-0.008454626674237728, -0.042242182959429775,...","[-0.021914638134710028, -0.01193683864002826, ...","[0.0028138353932710665, -0.0018118849346542556...","[-0.009217069005517256, 0.00947797310043802, -...","[0.009889217066655569, 0.01016102300366312, -0...","[-0.017311103773502583, -0.03397474262001243, ...","[0.004361534024931888, -0.007090965712854643, ...","[-0.01711795809206414, -0.0013711740891346347,...","[0.00749580089212694, 0.007670999135788175, -0...","[0.019388546016529812, 0.0007423788959994397, ...",...,-0.005147,-0.000122,0.001145,0.002593,0.008858,n03240683_13368.JPEG,ImageNet,drilling_platform.n.01,"[[entity.n.01, physical_entity.n.01, object.n....","[{entity.n.01}, {physical_entity.n.01}, {objec..."


## Save to file

In [16]:
# Write the ImageNet subset to its own pickle file; the path is relative to the working directory.
imagenet_df.to_pickle('CSI1_ROIs_TR34_ImageNet.pkl')

## ROI LOC Investigations

In [4]:
# Open the CSI2 ROI file in read-only mode for inspection.
# NOTE(review): the handle is never closed in the cells shown here.
csi2 = h5py.File('bold5000/ROIs/CSI2/h5/CSI2_ROIs_TR34.h5', 'r')

In [6]:
# List the top-level keys stored in the CSI2 file.
csi2.keys()

<KeysViewHDF5 ['LHEarlyVis', 'LHLOC', 'LHOPA', 'LHPPA', 'LHRSC', 'RHEarlyVis', 'RHLOC', 'RHOPA', 'RHPPA', 'RHRSC']>

In [9]:
# Shape of the first item of the LHLOC dataset.
# NOTE(review): presumably the length is the number of voxels in this ROI — confirm.
csi2['LHLOC'][0].shape

(327,)

In [10]:
# Shape of the first item of the LHEarlyVis dataset, for comparison with LHLOC.
csi2['LHEarlyVis'][0].shape

(254,)