# 2021 NMA Butterfly Unicorns Project Team

Anjali Srinivasan, Aaditya Prasad, Zac Wheeler

In [1]:
# imports
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# for visualization
!pip install nilearn --quiet
from nilearn import plotting, datasets

You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


## Data Retrieval

In [2]:
# The download cells will store the data in nested directories starting here:
HCP_DIR = "./hcp"
if not os.path.isdir(HCP_DIR):
  os.mkdir(HCP_DIR)

# The data shared for NMA projects is a subset of the full HCP dataset
N_SUBJECTS = 339

# The data have already been aggregated into ROIs from the Glasesr parcellation
N_PARCELS = 360

# The acquisition parameters for all tasks were identical
TR = 0.72  # Time resolution, in sec

# The parcels are matched across hemispheres with the same order
HEMIS = ["Right", "Left"]

# Each experiment was repeated multiple times in each subject
N_RUNS_REST = 4
N_RUNS_TASK = 2

# Time series data are organized by experiment, with each experiment
# having an LR and RL (phase-encode direction) acquistion
BOLD_NAMES = [
  "rfMRI_REST1_LR", "rfMRI_REST1_RL",
  "rfMRI_REST2_LR", "rfMRI_REST2_RL",
  "tfMRI_MOTOR_RL", "tfMRI_MOTOR_LR",
  "tfMRI_WM_RL", "tfMRI_WM_LR",
  "tfMRI_EMOTION_RL", "tfMRI_EMOTION_LR",
  "tfMRI_GAMBLING_RL", "tfMRI_GAMBLING_LR",
  "tfMRI_LANGUAGE_RL", "tfMRI_LANGUAGE_LR",
  "tfMRI_RELATIONAL_RL", "tfMRI_RELATIONAL_LR",
  "tfMRI_SOCIAL_RL", "tfMRI_SOCIAL_LR"
]

# You may want to limit the subjects used during code development.
# This will use all subjects:
subjects = range(N_SUBJECTS)

### Data Download

In [3]:
fname = "hcp_task.tgz"
if not os.path.exists(fname):
  !wget -qO $fname https://osf.io/s4h8j/download/
  !tar -xzf $fname -C $HCP_DIR --strip-components=1

tar: subjects/307/EVs/tfMRI_SOCIAL_RL: Cannot mkdir: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/._tfMRI_LANGUAGE_LR: Cannot open: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/tfMRI_LANGUAGE_LR: Cannot mkdir: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/._tfMRI_WM_RL: Cannot open: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/tfMRI_WM_RL: Cannot mkdir: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/._tfMRI_WM_LR: Cannot open: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/tfMRI_WM_LR: Cannot mkdir: No such file or directory
tar: subjects/307/EVs: Cannot mkdir: Disk quota exceeded
tar: subjects/307/EVs/._tfMRI_SOCIAL_LR: Cannot open: No 

In [4]:
fname = "hcp_covariates.tgz"
if not os.path.exists(fname):
  !wget -qO $fname https://osf.io/x5p4g/download/
  !tar -xzf $fname -C $HCP_DIR --strip-components=1

hcp_covariates.tgz: Disk quota exceeded
tar (child): hcp_covariates.tgz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [5]:
fname = f"{HCP_DIR}/atlas.npz"
if not os.path.exists(fname):
  !wget -qO $fname https://osf.io/j5kuc/download

./hcp/atlas.npz: Disk quota exceeded


### Region Info

In [6]:
regions = np.load(f"{HCP_DIR}/regions.npy").T
region_info = dict(
    name=regions[0].tolist(),
    network=regions[1],
    myelin=regions[2].astype(np.float),
)

#print(len(region_info["name"]))
#print(len(region_info["network"]))
#print(len(region_info["myelin"]))

#print(region_info["name"])
#print(region_info["network"])
#print()

#names = region_info["name"]
#networks = region_info["network"]

#language_areas = []

#for i in range(len(networks)):
#  if networks[i] == "Auditory":
#    print(i, names[i])

#posterior-mu = posterior multimodal

In [7]:
with np.load(f"{HCP_DIR}/atlas.npz") as dobj:
  atlas = dict(**dobj)

FileNotFoundError: [Errno 2] No such file or directory: './hcp/atlas.npz'

## Helper Functions (For Data Loading)

get_image_ids(name)

In [None]:
def get_image_ids(name):
  """Get the 1-based image indices for runs in a given experiment.

    Args:
      name (str) : Name of experiment ("rest" or name of task) to load
    Returns:
      run_ids (list of int) : Numeric ID for experiment image files

  """
  run_ids = [
    i for i, code in enumerate(BOLD_NAMES, 1) if name.upper() in code
  ]
  if not run_ids:
    raise ValueError(f"Found no data for '{name}''")
  return run_ids

load_timeseries(subject, name, runs=None, concat=True, remove_mean=True)

In [None]:
def load_timeseries(subject, name, runs=None, concat=True, remove_mean=True):
  """Load timeseries data for a single subject.
  
  Args:
    subject (int): 0-based subject ID to load
    name (str) : Name of experiment ("rest" or name of task) to load
    run (None or int or list of ints): 0-based run(s) of the task to load,
      or None to load all runs.
    concat (bool) : If True, concatenate multiple runs in time
    remove_mean (bool) : If True, subtract the parcel-wise mean

  Returns
    ts (n_parcel x n_tp array): Array of BOLD data values

  """
  # Get the list relative 0-based index of runs to use
  if runs is None:
    runs = range(N_RUNS_REST) if name == "rest" else range(N_RUNS_TASK)
  elif isinstance(runs, int):
    runs = [runs]

  # Get the first (1-based) run id for this experiment 
  offset = get_image_ids(name)[0]

  # Load each run's data
  bold_data = [
      load_single_timeseries(subject, offset + run, remove_mean) for run in runs
  ]

  # Optionally concatenate in time
  if concat:
    bold_data = np.concatenate(bold_data, axis=-1)

  return bold_data


load_single_timeseries(subject, bold_run, remove_mean=True)

In [None]:
def load_single_timeseries(subject, bold_run, remove_mean=True):
  """Load timeseries data for a single subject and single run.
  
  Args:
    subject (int): 0-based subject ID to load
    bold_run (int): 1-based run index, across all tasks
    remove_mean (bool): If True, subtract the parcel-wise mean

  Returns
    ts (n_parcel x n_timepoint array): Array of BOLD data values

  """
  bold_path = f"{HCP_DIR}/subjects/{subject}/timeseries"
  bold_file = f"bold{bold_run}_Atlas_MSMAll_Glasser360Cortical.npy"
  ts = np.load(f"{bold_path}/{bold_file}")
  if remove_mean:
    ts -= ts.mean(axis=1, keepdims=True)
  return ts

load_evs(subject, name, condition)

In [None]:
def load_evs(subject, name, condition):
  """Load EV (explanatory variable) data for one task condition.

  Args:
    subject (int): 0-based subject ID to load
    name (str) : Name of task
    condition (str) : Name of condition

  Returns
    evs (list of dicts): A dictionary with the onset, duration, and amplitude
      of the condition for each run.

  """
  evs = []
  for id in get_image_ids(name):
    task_key = BOLD_NAMES[id - 1]
    ev_file = f"{HCP_DIR}/subjects/{subject}/EVs/{task_key}/{condition}.txt"
    ev_array = np.loadtxt(ev_file, ndmin=2, unpack=True)
    ev = dict(zip(["onset", "duration", "amplitude"], ev_array))
    evs.append(ev)
  return evs

condition_frames(run_evs, skip=0)

In [None]:
def get_frames_for_evs(run_evs, skip=0):
  """Identify timepoints corresponding to a given condition in each run.

  Args:
    run_evs (list of dicts) : Onset and duration of the event, per run
    skip (int) : Ignore this many frames at the start of each trial, to account
      for hemodynamic lag

  Returns:
    frames_list (list of 1D arrays): Flat arrays of frame indices, per run

  """
  frames_list = []
  for ev in run_evs:
    print("ev: ", ev)

    # Determine when trial starts, rounded down
    start = np.floor(ev["onset"] / TR).astype(int)

    # Use trial duration to determine how many frames to include for trial
    # TR = 0.72  # Time resolution, in sec
    duration = np.ceil(ev["duration"] / TR).astype(int)

    # Take the range of frames that correspond to this specific trial
    frames = [s + np.arange(skip, d) for s, d in zip(start, duration)]

    frames_list.append(frames)

  return frames_list

selective_average(timeseries_data, ev, skip=0)

In [None]:
def selective_averages(timeseries_data, ev, skip=0):
  """Take the temporal mean across frames for a given condition.

  Args:
    timeseries_data (array or list of arrays): n_parcel x n_tp arrays
    ev (dict or list of dicts): Condition timing information
    skip (int) : Ignore this many frames at the start of each trial, to account
      for hemodynamic lag

  Returns:(
    selected_data: has shape: 2, 360, [number of trial runs] and contains the 
    average for each trial run

  """
  # Ensure that we have lists of the same length
  if not isinstance(timeseries_data, list):
    timeseries_data = [timeseries_data]
  if not isinstance(ev, list):
    ev = [ev]
  if len(timeseries_data) != len(ev):
    raise ValueError("Number of `timeseries_data` and `ev` objects given must match.")

  # Identify the indices of relevant frames
  frames_list = get_frames_for_evs(ev, skip)
  print("frames_list[0]: ", end=": ")
  print(frames_list[0])
  print(len(frames_list[0]))

  # Select the frames from each image
  # timeseries_data has shape: 2, 360, 316 (we already indexed by subject before passing it in)
  # selected_data has shape: 2, 360, [number of trial runs] and contains the average for each trial run
  selected_data = np.empty((2,360,len(frames_list[0])))
  
  # RL?
  # each measurement direction
  for rl in range(2):
    # each brain parcel
    for parcel in range(360):
      run_count = 0
      # each trial run
      for run in frames_list[0]:
        window_sum = 0
        # each BOLD signal measurement in the trial run
        for measurement_index in run:
          window_sum = window_sum + timeseries_data[rl][parcel][measurement_index]
        selected_data[rl][parcel][run_count] = window_sum/len(run)
        run_count = run_count + 1
      #print("parcel:", parcel, selected_data[rl][parcel])
  
  return selected_data

## Task Analysis

### timeseries_task

In [None]:
timeseries_task = []

# timeseries_task format: subject, then LR/RL, then parcel, then time
# 339, 2, 360, 316
for subject in subjects:
  timeseries_task.append(load_timeseries(subject, "language", concat=False))

In [None]:
#print(np.array(timeseries_task).shape)
#print(timeseries_task[0])
#print()
#print(timeseries_task[0][0][0][0])

### Task Descriptions

MOTOR:&nbsp;cue,&nbsp;lf,&nbsp;lh,&nbsp;rf,&nbsp;rh,&nbsp;t

WM:&nbsp;&nbsp;&nbsp;&nbsp;0bk_body,&nbsp;0bk_faces,&nbsp;0bk_nir,&nbsp;0bk_placed,&nbsp;0bk_tools,&nbsp;&nbsp;2bk_body,&nbsp;2bk_faces,&nbsp;2bk_nir,&nbsp;2bk_placed,&nbsp;2bk_tools,&nbsp;0bk_cor,&nbsp;0bk_err,&nbsp;&nbsp;2bk_cor,&nbsp;2bk_err,&nbsp;&nbsp;all_bk_cor,&nbsp;all_bk_err

EMOTION:&nbsp;feat,&nbsp;neutral

GAMBLING:&nbsp;loss,&nbsp;loss_event,&nbsp;win,&nbsp;win_event,&nbsp;neut_event

LANGUAGE:&nbsp;cue, math,&nbsp;story, present_math,&nbsp;present_story,&nbsp;question_math,&nbsp;question_story,&nbsp;response_math,&nbsp;response_story

RELATIONAL:&nbsp;error,&nbsp;match,&nbsp;relation

SOCIAL:&nbsp;mental_resp,&nbsp;mental,&nbsp;other_resp,&nbsp;rnd

### Task Data Retrieval

In [None]:
def get_region_avgs_for_cond(subject, task, cond):
  avgs = []
  # get a list of dicts describing the active time windows based on the subject and task/condition
  evs = load_evs(subject, task, cond)
  # get array of the average BOLD data w/ an avg for each trial, 
  # in each region, and one for each of RL/LR
  avgs = selective_averages(timeseries_task[subject], evs)
  return avgs

task = "language"
subject = 0
shift = 6

avgs_qm = []  #question math
avgs_qs = []  #question story

avgs_qm = get_region_avgs_for_cond(subject, task, "question_math")
avgs_qs = get_region_avgs_for_cond(subject, task, "question_story")

print("question math: ")
print(avgs_qm)

print("question story: ")
print(avgs_qs)

'''
for subject in subjects:
  print(subject)
  # Get the average signal in each region for each condition

  # format: condition, then LR/RL (direction), then dictionary w/ trials
  # 4 trials for story, 9 trials for math
  evs = [load_evs(subject, task, cond) for cond in conditions]
  
  # conditions, 
  avgs = [selective_average(timeseries_task[subject], ev) for ev in evs]
  ## BOLD data timeseries: /V\M\----/\/\/
  ## EV data:              -------~~|****|
  ##                              ^ onset
  ##                               **** = duration
  ## skip = 2 = ~~

  
  # add subject average to avg arrays
  #avg_qs = np.append(avg_qs, avgs[0])
  #avg_qm = np.append(avg_qm, avgs[1])
  #avg_rs = np.append(avg_rs, avgs[2])
  #avg_rm = np.append(avg_rm, avgs[3])

#print(len(avgs[2][1]))
#print(evs[0][0]['onset'])
#print(np.array(avgs).shape)
'''

ev:  {'onset': array([ 33.167,  46.122,  57.357, 142.776, 154.051, 164.273, 174.643,
       186.904, 198.579, 210.668, 223.276]), 'duration': array([3.606, 3.724, 3.714, 3.616, 3.129, 3.076, 3.514, 3.667, 2.913,
       3.603, 3.52 ]), 'amplitude': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
ev:  {'onset': array([ 36.599,  49.887,  93.309, 106.664, 147.794, 160.109, 173.383,
       219.084]), 'duration': array([3.503, 3.97 , 3.741, 3.064, 3.269, 3.774, 3.716, 3.972]), 'amplitude': array([1., 1., 1., 1., 1., 1., 1., 1.])}
frames_list[0]: : [array([46, 47, 48, 49, 50, 51]), array([64, 65, 66, 67, 68, 69]), array([79, 80, 81, 82, 83, 84]), array([198, 199, 200, 201, 202, 203]), array([213, 214, 215, 216, 217]), array([228, 229, 230, 231, 232]), array([242, 243, 244, 245, 246]), array([259, 260, 261, 262, 263, 264]), array([275, 276, 277, 278, 279]), array([292, 293, 294, 295, 296, 297]), array([310, 311, 312, 313, 314])]
11
ev:  {'onset': array([ 19.413,  83.747, 108.017, 129.674]

"\nfor subject in subjects:\n  print(subject)\n  # Get the average signal in each region for each condition\n\n  # format: condition, then LR/RL (direction), then dictionary w/ trials\n  # 4 trials for story, 9 trials for math\n  evs = [load_evs(subject, task, cond) for cond in conditions]\n  \n  # conditions, \n  avgs = [selective_average(timeseries_task[subject], ev) for ev in evs]\n  ## BOLD data timeseries: /V\\M\\----/\\/\\/\n  ## EV data:              -------~~|****|\n  ##                              ^ onset\n  ##                               **** = duration\n  ## skip = 2 = ~~\n\n  \n  # add subject average to avg arrays\n  #avg_qs = np.append(avg_qs, avgs[0])\n  #avg_qm = np.append(avg_qm, avgs[1])\n  #avg_rs = np.append(avg_rs, avgs[2])\n  #avg_rm = np.append(avg_rm, avgs[3])\n\n#print(len(avgs[2][1]))\n#print(evs[0][0]['onset'])\n#print(np.array(avgs).shape)\n"

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0cfc2388-b22c-424d-b321-d09bf45f57a9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>