# Create tardb for AVFP fear runs

Creates a tardb for the AVFP dataset, restricted to a single condition.

2022 June
Yiyu



In [1]:
# Sanity check: list the conda environments; the active one is marked with (*).
%conda env list

# conda environments:
#
HTFATorch                /home/wang.yiyu/.conda/envs/HTFATorch
NTFA_env3             *  /home/wang.yiyu/.conda/envs/NTFA_env3
base                     /shared/centos7/anaconda3/3.7


Note: you may need to restart the kernel to use updated packages.


In [2]:
# path for the NTFA package
NTFA_path = "/work/abslab/NTFA_packages/NTFADegeneracy/"

import sys
sys.path.append(NTFA_path)
import htfa_torch.niidb as niidb
import htfa_torch.utils as utils
import htfa_torch.tardb as tardb
import logging
import numpy as np
import pandas as pd
import glob
import os
import webdataset as wds
import torch

# Timestamped INFO-level logging, so the progress messages emitted while the
# database is being built are shown in the notebook.
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)



In [3]:
# Input locations.
nifti_dir = '/work/abslab/AVFP/denoised/'  # root folder from which nifti_filename() builds NIfTI paths
logfiles_dir = '/work/abslab/AVFP/logfiles/AffVidsNovel_logfiles/'  # searched for behavioural *.txt log files

mask_dir = '/home/wang.yiyu/AVFP/masks/'  # folder containing the mask file
base_dir = '/work/abslab/Yiyu/NTFA_AVFP/'  # project root: the subject list is read from here and the tar database is written here

In [4]:
# set parameters:

# HRF-lag delay, in seconds, added to every video onset/offset before it is
# converted to TRs (rest intervals are not shifted; see TaskElement).
TASK_ONSET_DELAY = 5 # HRF-lag delay in seconds to add to onset/offset

TASK_TR = 0.8 #seconds per TR

# Substrings of task names: any task whose name contains one of these is not
# written to the database as a block. Rest intervals are still used for z-scoring.
exclude_tasks = ['rest','Social','Spiders'] #used for z-scoring, but don't want to model as a condition


# define a subset of participants to analyze:
# if all, set subs to 'All'
# if a subset, provide the number (int) of participants to be included
# (the first `subs` entries of the subject list are taken)
subs = 20

# using GM (and SNR) or SNR only?
# NOTE(review): in the cells shown here, mask_type only labels the output file
# name; the mask file itself is set separately in `mask_file`.
mask_type = 'GM' #'GMandSNR' #'GM', 'SNR'


# define condition
# NOTE(review): in the cells shown here, `condition` only labels the output
# file name; which videos are kept is decided by `exclude_tasks` above, so the
# two must be kept in sync by hand.
condition = 'HeightsOnly'


# Column names of the behavioural log files, in file order (one space-separated
# field per column).
log_file_headers = np.array(['video_name','video_number','video_category','novel_vs_familiar',
                   'run_number','video_onset','video_offset','duration_method1','duration_method2',
                   'fear_rating_onset','fear_rating','fear_rating_RT',
                   'arousal_rating_onset','arousal_rating','arousal_rating_RT',
                   'valence_rating_onset','valence_rating','valence_rating_RT'])

In [5]:
# function to fetch the NIfTI file path for a given subject and run:


def nifti_filename(subject, run):
    """Return the path of the denoised NIfTI file for one subject and run.

    Args:
        subject: Subject identifier (int or str); used both as the sub-folder
            name and inside the file name.
        run: Run number (int or str) inserted into the file name.

    Returns:
        str: ``<nifti_dir>/<subject>/sub-<subject>_run-<run>_AVFP_denoised_novideoregs.nii.gz``
    """
    file_name = f'sub-{subject}_run-{run}_AVFP_denoised_novideoregs.nii.gz'
    # os.path.join gives the same path whether or not `nifti_dir` ends with a
    # separator; plain string concatenation silently required the trailing '/'.
    return os.path.join(nifti_dir, str(subject), file_name)

# check: print one example path to confirm the file-name template is filled in as expected
print('\nexample file name:',nifti_filename(110,3),'\n')


example file name: /work/abslab/AVFP/denoised/110/sub-110_run-3_AVFP_denoised_novideoregs.nii.gz 



In [6]:
# load list of subjects included in our analyses:
# the CSV is read without a header row; its first column is taken as the
# subject IDs and kept as strings so they can be matched against file names.
included_data = pd.read_csv(base_dir + 'fmri_info/included_avfp_subjects.csv', header=None)
subIDs = included_data[0].astype('str').tolist()
print(subIDs)

['100', '103', '104', '105', '106', '107', '108', '109', '111', '112', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '127', '128', '130', '131', '132', '134', '135', '136', '137', '138', '139', '140', '142', '143', '144', '145', '146', '149', '150', '151', '152', '153', '154', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '169', '170', '171', '172', '174', '175', '176', '177', '179', '181', '182', '183', '184', '185', '186']


In [7]:
len(subIDs)

71

In [8]:
# Behavioural log files (one row per trial), grouped per subject:
# files[k] lists every log path that contains 'sub_<subIDs[k]>'.
files = glob.glob(logfiles_dir + '*.txt')
files = [
    [log_path for log_path in files if ('sub_' + sub_id) in log_path]
    for sub_id in subIDs
]
n_files = len(files)  # one (possibly empty) group per entry of subIDs

# Keep every subject's group, or only the first `subs` groups when a subset is requested.
log_files = files[:] if subs == 'All' else files[:subs]
print('\nusing',len(log_files),'logfiles out of',n_files,'\n')


using 20 logfiles out of 71 



In [9]:
# Previous, group-specific mask path (kept for reference):
# mask_file = mask_dir + 'NTFA/masks/' + mask_type + '_memorygroup_mask_N' + str(n_files) + '_downsampled.nii.gz'
# Mask currently applied to every block.
# NOTE(review): the file name suggests a grey-matter mask on the ICBM152 template — confirm.
mask_file = mask_dir + 'gm_mask_icbm152_brain.nii.gz'

In [10]:
# define database filename
# The name encodes the number of subject groups (n_files), the subset size (or
# 'All'), the mask label and the condition label. Subset databases are written
# under data/downsampled_test/, the full database under data/.
# NOTE(review): the saved output shown for this cell has no '_<condition>'
# suffix, so it was produced before the suffix was added — re-run to refresh.
if subs != 'All':
    # older naming scheme, kept for reference:
    #AVFP_FILE = 'data/AVFP_NTFA_memory_N' + str(n_files) + '_subsetN' + str(subs) + '_' + mask_type + 'mask.tar'
    AVFP_FILE = base_dir + f'data/downsampled_test/AVFP_NTFA_N{n_files}_subsetN{subs}_{mask_type}mask_{condition}.tar'
else: #including all subjects
    AVFP_FILE = base_dir + f'data/AVFP_NTFA_N{n_files}_{subs}_{mask_type}mask_{condition}.tar'
print('\nSaving database to:',AVFP_FILE)

tar_file = AVFP_FILE
# The tar writer is opened here; samples are written to it and it is closed in
# later cells, so this cell must be re-run before rebuilding the database.
sink = wds.TarWriter(tar_file)


Saving database to: /work/abslab/Yiyu/NTFA_AVFP/data/downsampled_test/AVFP_NTFA_N71_subsetN20_GMmask.tar


In [11]:
# HTML file name derived from the database path: same path, extension swapped.
# os.path.splitext strips only the final extension, whereas str.split('.')[0]
# would cut the path at the first '.' anywhere in it (e.g. in a directory name).
html_name = os.path.splitext(AVFP_FILE)[0] + '.html'
%store html_name

Stored 'html_name' (str)


In [12]:
# Show the database path without its extension. os.path.splitext only removes
# the final extension, so a '.' elsewhere in the path cannot truncate the result.
os.path.splitext(AVFP_FILE)[0]

'/work/abslab/Yiyu/NTFA_AVFP/data/downsampled_test/AVFP_NTFA_N71_subsetN20_GMmask'

# Functions to create labels in database

In [13]:
# set up functions to read task files:
class TaskElement:
    """One labelled interval (a video or a rest period) within a run.

    Times are stored as integers: video times are shifted by TASK_ONSET_DELAY
    seconds and divided by TASK_TR, rest times are only rounded.
    """

    def __init__(self, task, start, end, run, fear_rating=None):
        """Store the label and convert the interval bounds.

        Args:
            task: Label of the interval; the literal 'rest' marks a rest period.
            start: Interval start, or None. For non-rest tasks this is assumed
                to be in seconds; for 'rest' it is only rounded.
            end: Interval end, or None; same convention as ``start``.
            run: Run the interval belongs to.
            fear_rating: Optional rating attached to the interval.
        """
        is_rest = task == 'rest'

        def to_rounded(value):
            # None marks an open-ended bound and is passed through unchanged.
            if value is None:
                return None
            if is_rest:
                return round(value)
            # assumes task onsets are in seconds
            return round((value + TASK_ONSET_DELAY) / TASK_TR)

        self.task = task
        self.run = run
        self.fear_rating = fear_rating
        self.start_time = to_rounded(start)
        self.end_time = to_rounded(end)

        
def parse_task_lines(lines, headers):
    """Yield one TaskElement per video trial described in a behavioural log.

    Args:
        lines: Iterable of log lines. Each non-blank line holds one trial with
            fields separated by single spaces, in the order given by ``headers``.
        headers: Sequence (list or 1-D array) of column names for the fields.
            Must contain 'video_name', 'novel_vs_familiar', 'video_onset',
            'video_offset', 'run_number' and 'fear_rating'.

    Yields:
        TaskElement: labelled ``'<New|Old>_<video name without its last 4
        characters>_AVFP'``, with onset/offset as floats, the run as int and
        the absolute fear rating (0.5 when the logged rating is NaN).

    Raises:
        ValueError: if a required column name is missing from ``headers``, or
            a field cannot be converted to a number.
    """
    study = 'AVFP'

    # Resolve every column position once, from the `headers` argument only.
    # (The 'novel_vs_familiar' lookup previously used the global
    # `log_file_headers`, ignoring the headers passed by the caller.)
    header_names = [str(name) for name in headers]
    name_col = header_names.index('video_name')
    novel_col = header_names.index('novel_vs_familiar')
    onset_col = header_names.index('video_onset')
    offset_col = header_names.index('video_offset')
    run_col = header_names.index('run_number')
    fear_col = header_names.index('fear_rating')

    for line in lines:
        if not line.strip():
            continue  # skip blank lines, e.g. an empty line at the end of the file
        cols = line.split(' ')

        mem_type = 'New' if cols[novel_col] == '1' else 'Old'
        # [:-4] drops the last four characters of the video name
        # (presumably a file extension such as '.mp4' — confirm).
        task = f'{mem_type}_{cols[name_col][:-4]}_{study}'

        start_time = float(cols[onset_col])
        end_time = float(cols[offset_col])
        run = int(cols[run_col])
        fear_rating = abs(float(cols[fear_col]))
        if np.isnan(fear_rating): # if didn't move the slider at all
            fear_rating = .5 #middle
        yield TaskElement(task, start_time, end_time, run, fear_rating)

        
def rest_tasks(tasks):
    """Yield the 'rest' TaskElements that surround and separate ``tasks``.

    Args:
        tasks: List of task elements exposing ``start_time``, ``end_time`` and
            ``run``. They are processed in the given order, which is assumed
            to be chronological within each run.

    Yields:
        TaskElement: 'rest' intervals, in this order:
          * from 0 up to just before the first task;
          * between each pair of consecutive tasks of the same run;
          * at a run change: an open-ended rest (end ``None``) closing the
            previous run, then a rest from 0 up to the first task of the new run;
          * an open-ended rest after the last task.
        Nothing is yielded for an empty ``tasks`` list.
    """
    if not tasks:
        # An empty log has no rest periods; indexing tasks[0] would raise IndexError.
        return

    first, last = tasks[0], tasks[-1]
    yield TaskElement('rest', 0, first.start_time - 1, first.run)

    for previous, current in zip(tasks, tasks[1:]):
        rest_start = previous.end_time + 1
        rest_end = current.start_time - 1
        if current.run == previous.run:
            yield TaskElement('rest', rest_start, rest_end, current.run)
        else:
            # Run boundary: close the previous run and open the new one.
            yield TaskElement('rest', rest_start, None, previous.run)
            yield TaskElement('rest', 0, rest_end, current.run)

    yield TaskElement('rest', last.end_time + 1, None, last.run)

In [14]:
def read_tasks(task_csv, headers):
    """Yield one activation block per modelled video trial of a log file.

    Args:
        task_csv: Path of a behavioural log file ending in '.txt'. The three
            characters immediately before '.txt' must be the numeric subject ID.
        headers: Column names of the log file, forwarded to parse_task_lines.

    Yields:
        niidb.FMriActivationBlock: created with ``zscore=True,
        zscore_by_rest=True`` and filled with the subject, task label, run,
        start/end times, the closed rest intervals of the same run, and the
        trial's fear rating under ``individual_differences``. Tasks whose name
        contains any entry of ``exclude_tasks`` (rest included) are skipped.

    Raises:
        ValueError: if the subject ID cannot be parsed from the file name.
        KeyError: if a rest interval belongs to a run other than 3, 4 or 5.
    """
    def sentinel(f):
        # Sort/log helper: treat a missing value as 0.0.
        return f if f is not None else 0.0

    subject = int(task_csv.split('.txt')[0][-3:])
    logging.info('Subject %d', subject)

    # Read the whole log first so the file handle is released before any block
    # is yielded; the consumer does slow work between yields and the file
    # previously stayed open for the generator's whole lifetime.
    with open(task_csv, 'r') as task_csv_file:
        raw_lines = task_csv_file.readlines()

    task_lines = list(parse_task_lines(raw_lines, headers))
    task_lines += list(rest_tasks(task_lines))

    # Collect, per run, the rest intervals that have both a start and an end.
    rest_lines = [r for r in task_lines if r.task == 'rest']
    rest_lines = sorted(rest_lines, key=lambda t: sentinel(t.run))
    rest_starts_dict = {key: [] for key in range(3, 6)} # runs 3,4,5 of AVFP
    rest_ends_dict = {key: [] for key in range(3, 6)}
    for rest in rest_lines:
        if rest.end_time is not None and rest.start_time is not None:
            rest_ends_dict[rest.run].append(rest.end_time)
            rest_starts_dict[rest.run].append(rest.start_time)

    task_lines = sorted(task_lines, key=lambda t: sentinel(t.start_time))
    for task in task_lines:
        if any(ele in task.task for ele in exclude_tasks):
            continue
        logging.info('Saving %s, run %d: started at %f, ended at %f',
                     task.task, task.run, sentinel(task.start_time), sentinel(task.end_time))
        result = niidb.FMriActivationBlock(zscore=True, zscore_by_rest=True)
        result.subject = subject
        result.task = task.task
        result.run = task.run
        result.start_time = task.start_time
        result.end_time = task.end_time
        result.rest_start_times = rest_starts_dict[result.run]
        result.rest_end_times = rest_ends_dict[result.run]
        result.individual_differences = {'fear_rating': task.fear_rating}
        yield result

# Create database!!

In [None]:
# create the database:
# note that the other blocks (rest) include all TRs between the videos
#
# For every selected subject, each block produced by read_tasks() is completed
# with its NIfTI path, mask and smoothing setting, loaded, and written to the
# tar writer `sink`; its metadata is collected in `metadata['blocks']`.

total_trs = 0  # running sum of block lengths (end_time - start_time)
metadata = {
    'blocks': []
}
block_id = 0  # consecutive block index across all subjects
for text_file in log_files:
    # text_file is the list of log paths matched for one subject; only the
    # first match is read.
    # NOTE(review): an empty list (subject without a log file) raises IndexError here.
    for block in read_tasks(text_file[0], log_file_headers):
        block.filename = nifti_filename(block.subject, block.run)
        # The rest-interval lists are replaced by their '[a, b, ...]' string form.
        block.rest_end_times = '[' + ', '.join(map(str, block.rest_end_times)) + ']'
        block.rest_start_times = '[' + ', '.join(map(str, block.rest_start_times)) + ']'
        block.block = block_id
        block_id += 1
        block.mask = mask_file
        # NOTE(review): presumably a smoothing kernel size — units not visible here, confirm.
        block.smooth = 6
        # All attributes above are set before load() is called; presumably load()
        # reads the NIfTI data using them — confirm against htfa_torch.niidb.
        block.load()
        metadata['blocks'].append(block.wds_metadata())

        for vals in block.format_wds():
            sink.write(vals)
        block_trs = (block.end_time - block.start_time)
        total_trs += block_trs



In [None]:
# Save the metadata next to the tar file and always close the tar writer.
# `block` is the last block left over from the database-building loop, so that
# cell must have produced at least one block before this one is run.
# NOTE(review): this stores the voxel locations of the last block only; it
# presumably relies on all blocks sharing the same mask — confirm.
try:
    metadata['voxel_locations'] = block.locations
    metadata['num_times'] = total_trs
    torch.save(metadata, tar_file + '.meta')
    logging.info('Recorded metadata, including voxel locations')
finally:
    # Close the writer even if saving the metadata fails, so the tar archive
    # is not left open and unfinalized.
    sink.close()

In [None]:
# Re-open the finished tar database and save derived data alongside it.
# NOTE(review): mean_block / normalize_activations are htfa_torch methods whose
# exact effect is not visible in this notebook; presumably they compute and
# persist block means and normalized activations — confirm. This cell must run
# after the tar writer has been closed.
avfp_db = tardb.FmriTarDataset(AVFP_FILE)
avfp_db.mean_block(save=True)
avfp_db.normalize_activations(save=True)

In [None]:
# Final confirmation message with the path of the database that was built.
logging.info('Finished building TarDb out of AVFP runs 3, 4, 5 dataset in %s',AVFP_FILE)