Set up the runtime environment and the file paths for the following data:
    NSD,
    NSD betas,
    experiment design files,
    COCO annotation files (detailed information about each image),
    COCO caption files (textual descriptions of each image)

In [2]:
from transformers import ViltProcessor, ViltModel
from PIL import Image
import requests
import json
from collections import defaultdict
import os
from scipy.io import loadmat
import torch
from torch import nn
import pandas as pd
import numpy as np
from tqdm import tqdm
from torchinfo import summary

# --- Run configuration -------------------------------------------------------
# Tag identifying which caption variant is embedded; it is also formatted into
# the output file name. Presumably "s" = shortest caption -- TODO confirm.
cap_lenght = "s"

# --- Runtime environment -----------------------------------------------------
os.chdir(os.getcwd())
# Must be set before CUDA is first queried so that only GPU 0 is visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# --- NSD directory layout ----------------------------------------------------
nsd_root = "/data/public/wbc/NSDdataset/"
# Colour natural-scene images used as stimuli in the NSD experiments.
stim_root = nsd_root + "nsddata_stimuli/stimuli/nsd/"
beta_root = nsd_root + "nsddata_betas/ppdata/"
mask_root = nsd_root + "nsddata/ppdata/"
exp_design_file = nsd_root + "nsddata/experiments/nsd/nsd_expdesign.mat"
nsd_stiminfo_file = nsd_root + 'nsddata/experiments/nsd/nsd_stim_info_merged.pkl'

# --- COCO directory layout ---------------------------------------------------
annDir = '/data/public/wbc//NSDdataset/coco/'
# Combined folder holding both the train2017 and val2017 images.
imgDir = annDir + 'trainval2017/'
# Instance (object) annotation files.
instances_trn_annFile = annDir + 'annotations_trainval2017/annotations/instances_train2017.json'
instances_val_annFile = annDir + 'annotations_trainval2017/annotations/instances_val2017.json'
# Caption annotation files.
instance_trn_capFile = annDir + 'annotations_trainval2017/annotations/captions_train2017.json'
instance_val_capFile = annDir + 'annotations_trainval2017/annotations/captions_val2017.json'

# --- Experiment design -------------------------------------------------------
# The notes below restate the original author's annotations; shapes are not
# verified here.
exp_design = loadmat(exp_design_file)
basic_cnt = exp_design['basiccnt']
# Sort index of the images shared across subjects.
shared_idx = exp_design['sharedix']
# 8 x 10000: NSD id of every stimulus per subject; the first 1000 columns are
# the same for all 8 subjects and match `sharedix`.
subject_idx = exp_design['subjectim']
trial_order = exp_design['masterordering']
# 40 sessions x 12 runs x 75 trials of 0/1 flags marking when a stimulus
# trial actually occurred.
stim_pattern = exp_design['stimpattern']

# Per-stimulus table: COCO id plus further information for every NSD image.
stiminfo = pd.read_pickle(nsd_stiminfo_file)

Match the images used in the NSD experiment with the COCO dataset to obtain information such as their categories and textual descriptions.

Because the COCO dataset provides five captions for each image, and long captions can exceed the model's maximum input length, we use the shortest caption for each image so that as little descriptive information as possible is lost to truncation.

In [3]:
# Build COCO lookup tables from the train and val caption files:
#   imgIdToCaps: COCO image id -> list of caption annotation dicts
#   imgIdToImg:  COCO image id -> list of image-info dicts
imgIdToCaps = defaultdict(list)
imgIdToImg = defaultdict(list)
for cap_file in (instance_trn_capFile, instance_val_capFile):
    # The context manager closes the handle as soon as the JSON is parsed.
    with open(cap_file, 'r', encoding='utf-8') as fh:
        dataset = json.load(fh)
    for ann in dataset.get('annotations', []):
        imgIdToCaps[ann['image_id']].append(ann)
    for img in dataset.get('images', []):
        imgIdToImg[img['id']].append(img)

# cocoMap[nsdId] -> cocoId for every NSD stimulus; -1 marks an NSD id that no
# subject's column in `stiminfo` flags.
cocoMap = np.full(73000, -1, dtype=int)
all_coco_ids = np.array(stiminfo['cocoId'])
all_nsd_ids = np.array(stiminfo['nsdId'])
for j in range(len(subject_idx)):
    # Boolean mask of the stimuli flagged for subject j+1.
    shown = np.asarray(stiminfo['subject%d' % (j + 1)].astype(bool))
    cocoMap[all_nsd_ids[shown]] = all_coco_ids[shown]


max_str_length = 0
imgIdToCaps_simple = {}  # {cocoid:(str)cap}
for k, c_item_list in imgIdToCaps.items():
    # Order this image's captions by length. A plain (or reversed) sort is
    # alphabetical, so it would not put the shortest caption first.
    cap_list = sorted(
        (c_item['caption'].rstrip() for c_item in c_item_list),
        key=len,
    )
    cap_s = cap_list[0]                    # shortest
    cap_m = cap_list[len(cap_list) // 2]   # medium (safe for < 3 captions)
    cap_l = cap_list[-1]                   # longest
    # Honour the notebook-level `cap_lenght` setting ("s" / "m" / "l"); an
    # unknown value raises KeyError instead of silently picking a caption.
    cap = {"s": cap_s, "m": cap_m, "l": cap_l}[cap_lenght]
    imgIdToCaps_simple[k] = cap  # {cocoid:(str)cap}

Load a ViLT model fine-tuned on the COCO dataset and use it to jointly encode the text and image information. The model's `pooler_output` is used as the embedding.

In [4]:
# Load the ViLT processor and model from the local COCO-fine-tuned checkpoint.
vilt_ckpt = "/data/public/wbc/data_analysis/nsddata_voxel_analysis/things/ViLT/vilt-b32-finetuned-coco"
processor = ViltProcessor.from_pretrained(vilt_ckpt)
model = ViltModel.from_pretrained(vilt_ckpt).to(device)
model.eval()  # make inference mode explicit (no dropout)
summary(model=model)

# Embed every NSD stimulus (image + chosen caption) in NSD-id order.
# Per-item outputs are collected in a list and joined once at the end;
# np.append inside the loop would re-copy the whole array on every iteration.
embeddings = []
with torch.no_grad():  # inference only: do not build autograd graphs
    for cId in tqdm(cocoMap, total=len(cocoMap)):
        cocoID = int(cId)
        image_file_name = imgIdToImg[cocoID][0]["file_name"]  # look up by COCO id
        text = imgIdToCaps_simple[cocoID]
        # Close the file handle promptly and force 3 channels so that any
        # grayscale image is given to the processor in RGB form.
        with Image.open(imgDir + image_file_name) as img_file:
            image = img_file.convert("RGB")
        inputs = processor(image, text, return_tensors="pt", truncation=True)
        inputs = inputs.to(device)
        outputs = model(**inputs)
        embeddings.append(outputs.pooler_output.cpu().numpy())

# One row per stimulus. The cast keeps the float64 dtype that appending to an
# empty np.array([]) used to produce.
result = np.concatenate(embeddings, axis=0).astype(np.float64)
result = result.reshape(len(cocoMap), -1)

Some weights of the model checkpoint at /data/public/wbc/data_analysis/nsddata_voxel_analysis/things/ViLT/vilt-b32-finetuned-coco were not used when initializing ViltModel: ['rank_output.weight', 'rank_output.bias']
- This IS expected if you are initializing ViltModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViltModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
360it [00:19, 24.69it/s]




13880it [12:47, 14.12it/s]

In [None]:
# np.save(os.path.join(os.getcwd(), "ViLT_embdding_{}.npy").format(cap_lenght), result)