# NMA Project 2021 - Butterfly Unicorns

Zac Wheeler, Anjali Srinivasan and Aaditya Prasad

## Imports

In [115]:
# imports
import os
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats



# for visualization -- this needs to come before pycaret
# pycaret installs newer dependencies that are not backward-compatible
#!pip install nilearn --quiet
#from nilearn import plotting, datasets
#!pip uninstall pycaret -y
# pycaret -- 2.3.2 doesn't seem to always work
# 2.1.0 seems too old, 2.2.0 sometimes works...
!pip install -U pycaret --quiet

## Data Retrieval

In [138]:
# The download cells will store the data in nested directories starting here:
HCP_DIR = "./hcp"
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate isdir() test.
os.makedirs(HCP_DIR, exist_ok=True)

# The data shared for NMA projects is a subset of the full HCP dataset
N_SUBJECTS = 339

# The data have already been aggregated into ROIs from the Glasser parcellation
N_PARCELS = 360

# The acquisition parameters for all tasks were identical
TR = 0.72  # Time resolution, in sec

# The parcels are matched across hemispheres with the same order
HEMIS = ["Right", "Left"]

# Each experiment was repeated multiple times in each subject
N_RUNS_REST = 4
N_RUNS_TASK = 2

# Time series data are organized by experiment, with each experiment
# having an LR and RL (phase-encode direction) acquisition.
# The position in this list, counted from 1, is the image id that
# get_image_ids() returns and that load_single_timeseries() puts in the
# "bold{id}_..." file name, so the order of entries must not change.
# Note that the rest runs are listed LR first, the task runs RL first.
BOLD_NAMES = [
  "rfMRI_REST1_LR", "rfMRI_REST1_RL",
  "rfMRI_REST2_LR", "rfMRI_REST2_RL",
  "tfMRI_MOTOR_RL", "tfMRI_MOTOR_LR",
  "tfMRI_WM_RL", "tfMRI_WM_LR",
  "tfMRI_EMOTION_RL", "tfMRI_EMOTION_LR",
  "tfMRI_GAMBLING_RL", "tfMRI_GAMBLING_LR",
  "tfMRI_LANGUAGE_RL", "tfMRI_LANGUAGE_LR",
  "tfMRI_RELATIONAL_RL", "tfMRI_RELATIONAL_LR",
  "tfMRI_SOCIAL_RL", "tfMRI_SOCIAL_LR"
]

# You may want to limit the subjects used during code development.
# This will use all subjects:
subjects = range(N_SUBJECTS)

# 0-based parcel indices kept as model features by get_model_data().
# Every entry >= 180 equals an earlier entry + 180 (only 10 has no such
# partner) -- presumably the matching parcel in the other hemisphere, given
# the hemisphere ordering noted above; TODO confirm.
PARCEL_IDX = [10, 45, 49, 94, 95, 115, 116, 126, 135, 136, 142, 145, 225, 229, 274, 275, 295, 296, 306, 315, 316, 322, 325]

### Data Download

In [3]:
# Fetch and unpack the task archive. Both shell commands sit inside the
# existence check, so nothing is downloaded or re-extracted when the archive
# file is already present in the working directory.
fname = "hcp_task.tgz"
if not os.path.exists(fname):
  !wget -qO $fname https://osf.io/s4h8j/download/
  !tar -xzf $fname -C $HCP_DIR --strip-components=1

In [4]:
# Fetch and unpack the covariates archive into HCP_DIR; skipped entirely
# (download and extraction) when the archive file already exists.
fname = "hcp_covariates.tgz"
if not os.path.exists(fname):
  !wget -qO $fname https://osf.io/x5p4g/download/
  !tar -xzf $fname -C $HCP_DIR --strip-components=1

In [5]:
# Download the atlas file straight into HCP_DIR unless it is already there.
fname = f"{HCP_DIR}/atlas.npz"
if not os.path.exists(fname):
  !wget -qO $fname https://osf.io/j5kuc/download

### Region Info

In [6]:
# regions.npy is loaded and transposed so that rows 0, 1 and 2 hold the three
# fields used below (parcel name, network label, myelin value).
regions = np.load(f"{HCP_DIR}/regions.npy").T
region_info = dict(
    name=regions[0].tolist(),
    network=regions[1],
    # `np.float` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `float` is the type it aliased, so the conversion is unchanged.
    myelin=regions[2].astype(float),
)


#print(region_info)
#print(len(region_info["name"]))
#print(len(region_info["network"]))
#print(len(region_info["myelin"]))

#print(region_info["name"])
#print(region_info["network"])
#print()

#names = region_info["name"]
#networks = region_info["network"]

#language_areas = []

#for i in range(len(networks)):
#  if networks[i] == "Auditory":
#    print(i, names[i])

#posterior-mu = posterior multimodal

In [7]:
# Copy every array stored in the atlas archive into a plain dict before the
# `with` block closes the underlying file.
with np.load(f"{HCP_DIR}/atlas.npz") as dobj:
  atlas = {key: dobj[key] for key in dobj.keys()}

## Helper Functions (for Data Loading)

get_image_ids(name)

In [8]:
def get_image_ids(name):
  """Get the 1-based image indices for runs in a given experiment.

  The upper-cased `name` is searched as a substring of each BOLD_NAMES entry,
  so "language" and "LANGUAGE" select the same runs.

  Args:
    name (str) : Name of experiment ("rest" or name of task) to load

  Returns:
    run_ids (list of int) : Numeric ID for experiment image files

  Raises:
    ValueError : If no BOLD_NAMES entry contains `name`.

  """
  needle = name.upper()
  run_ids = [i for i, code in enumerate(BOLD_NAMES, 1) if needle in code]
  if not run_ids:
    # The original message ended with a stray extra quote character.
    raise ValueError(f"Found no data for '{name}'")
  return run_ids

load_timeseries(subject, name, runs=None, concat=True, remove_mean=True)

In [9]:
def load_timeseries(subject, name, runs=None, concat=True, remove_mean=True):
  """Load timeseries data for a single subject.

  Args:
    subject (int): 0-based subject ID to load
    name (str) : Name of experiment ("rest" or name of task) to load
    runs (None or int or list of ints): 0-based run(s) of the task to load,
      or None to load all runs.
    concat (bool) : If True, concatenate multiple runs in time
    remove_mean (bool) : If True, subtract the parcel-wise mean

  Returns
    ts: If `concat` is True, a single n_parcel x n_tp array of BOLD data
      values; otherwise a list with one n_parcel x n_tp array per run.

  """
  # Get the list relative 0-based index of runs to use
  if runs is None:
    # Compare case-insensitively so that "REST" (which get_image_ids accepts)
    # also loads all N_RUNS_REST runs instead of only N_RUNS_TASK.
    n_runs = N_RUNS_REST if name.lower() == "rest" else N_RUNS_TASK
    runs = range(n_runs)
  elif isinstance(runs, (int, np.integer)):
    runs = [runs]

  # Get the first (1-based) run id for this experiment
  offset = get_image_ids(name)[0]

  # Load each run's data
  bold_data = [
      load_single_timeseries(subject, offset + run, remove_mean) for run in runs
  ]

  # Optionally concatenate in time
  if concat:
    bold_data = np.concatenate(bold_data, axis=-1)

  return bold_data


load_single_timeseries(subject, bold_run, remove_mean=True)

In [10]:
def load_single_timeseries(subject, bold_run, remove_mean=True):
  """Read one run of parcellated BOLD data for one subject from disk.

  Args:
    subject (int): 0-based subject ID to load
    bold_run (int): 1-based run index, across all tasks
    remove_mean (bool): If True, subtract the parcel-wise mean

  Returns
    ts (n_parcel x n_timepoint array): Array of BOLD data values

  """
  run_file = (
      f"{HCP_DIR}/subjects/{subject}/timeseries"
      f"/bold{bold_run}_Atlas_MSMAll_Glasser360Cortical.npy"
  )
  ts = np.load(run_file)
  if not remove_mean:
    return ts
  # Demean each row (parcel) in place along the time axis.
  ts -= ts.mean(axis=1, keepdims=True)
  return ts

load_evs(subject, name, condition)

In [11]:
def load_evs(subject, name, condition):
  """Load EV (explanatory variable) data for one task condition.

  Args:
    subject (int): 0-based subject ID to load
    name (str) : Name of task
    condition (str) : Name of condition

  Returns
    evs (list of dicts): One dict per run of the task, with the keys
      "onset", "duration" and "amplitude" mapped to the columns of that
      run's EV text file.

  """
  evs = []
  # `run_id` rather than `id`, so the builtin id() is not shadowed.
  for run_id in get_image_ids(name):
    task_key = BOLD_NAMES[run_id - 1]
    ev_file = f"{HCP_DIR}/subjects/{subject}/EVs/{task_key}/{condition}.txt"
    # unpack=True transposes the table so each row of ev_array is one column
    # of the file; ndmin=2 keeps that true for single-trial files.
    ev_array = np.loadtxt(ev_file, ndmin=2, unpack=True)
    evs.append(dict(zip(["onset", "duration", "amplitude"], ev_array)))
  return evs

get_frames_for_evs(run_evs, skip=0, drop_frames=0)

In [12]:
def get_frames_for_evs(run_evs, skip=0, drop_frames=0):
  """Identify timepoints corresponding to a given condition in each run.

  For every trial the window starts `skip` frames after the trial onset
  (rounded down to a frame) and spans ceil(duration / TR - drop_frames)
  frames, i.e. the whole window is shifted later by `skip`.

  Args:
    run_evs (list of dicts) : Onset and duration of the event, per run
    skip (int) : Shift each trial window by this many frames, to account
      for hemodynamic lag
    drop_frames (number) : Subtracted from the trial length (in frames)
      before rounding up, shortening every window

  Returns:
    frames_list (list of lists of 1D arrays): For each run, one array of
      frame indices per trial

  """
  print("using lag of ", skip)
  print("using drop_frames of ", drop_frames)

  frames_list = []
  for run_ev in run_evs:
    # First frame of each trial, rounded down
    first_frames = np.floor(run_ev["onset"] / TR).astype(int)

    # Number of frames kept per trial (TR is the frame duration in seconds)
    n_frames = np.ceil(run_ev["duration"] / TR - drop_frames).astype(int)

    # Exclusive end offset of each window, relative to the trial's first frame
    window_ends = n_frames + skip

    trial_frames = []
    for first, end in zip(first_frames, window_ends):
      trial_frames.append(first + np.arange(skip, end))
    frames_list.append(trial_frames)

  return frames_list

selective_averages(timeseries_data, ev, skip=0, drop_frames=0)

In [13]:
def selective_averages(timeseries_data, ev, skip=0, drop_frames=0):
  """Take the temporal mean across frames for each trial of a condition.

  Args:
    timeseries_data (array or list of arrays): n_parcel x n_tp arrays, one
      per run
    ev (dict or list of dicts): Condition timing information, one per run
    skip (int) : Forwarded to get_frames_for_evs (hemodynamic lag, in frames)
    drop_frames (number) : Forwarded to get_frames_for_evs (frames removed
      from each trial window)

  Returns:
    selected_data (n_run x n_parcel object array): Each cell holds a 1D float
      array with one mean per usable trial of that run. A trial is skipped
      when its window is empty or reaches past the last timepoint of the run,
      so a cell may be an empty array.

  Raises:
    ValueError : If the number of runs in `timeseries_data` and `ev` differ.

  """
  # Ensure that we have lists of the same length
  if not isinstance(timeseries_data, list):
    timeseries_data = [timeseries_data]
  if not isinstance(ev, list):
    ev = [ev]
  if len(timeseries_data) != len(ev):
    raise ValueError("Number of `timeseries_data` and `ev` objects given must match.")

  # Identify the indices of relevant frames
  frames_list = get_frames_for_evs(ev, skip=skip, drop_frames=drop_frames)
  # Debug output: one line per run (looping avoids an IndexError when only a
  # single run is supplied).
  for run_frames in frames_list:
    print(run_frames)

  # Sizes are taken from the data instead of being fixed at 2 runs and
  # 360 parcels.
  n_runs = len(timeseries_data)
  n_parcels = len(timeseries_data[0]) if n_runs else 0
  selected_data = np.empty((n_runs, n_parcels), dtype=object)

  for run_idx, (run_ts, run_frames) in enumerate(zip(timeseries_data, frames_list)):
    run_ts = np.asarray(run_ts)
    n_tp = run_ts.shape[1]

    # Keep only windows that are non-empty and lie fully inside the run.
    usable = [
        frames for frames in run_frames
        if len(frames) > 0 and frames.max() < n_tp
    ]

    if usable:
      # One column per trial: mean over the window for every parcel at once.
      trial_means = np.stack(
          [run_ts[:, frames].mean(axis=1) for frames in usable], axis=1
      )
    else:
      trial_means = np.empty((n_parcels, 0))

    for parcel in range(n_parcels):
      selected_data[run_idx, parcel] = trial_means[parcel]

  return selected_data

test_concat(list1, list2, concat_list)

In [14]:
def test_concat(list1, list2, concat_list):

  for i in range(len(list1)):
    if not list1[i] == concat_list[i]:
      raise Exception("bad concat: list1")
  
  for j in range(len(list1), len(list1)+len(list2)):
    if not list2[j - len(list1)] == concat_list[j]:
      raise Exception("bad concat: list2")

get_region_avgs_for_cond(subject, task, cond, lag=0, drop_frames=0)

In [31]:
def get_region_avgs_for_cond(subject, task, cond, lag=0, drop_frames=0):
  """Per-trial average BOLD signal of one subject for one task condition.

  Loads the condition's timing windows with load_evs() and passes them,
  together with the subject's entry of the module-level `timeseries_task`,
  to selective_averages(). `lag` is forwarded as its `skip` argument and
  `drop_frames` is forwarded unchanged.

  Returns whatever selective_averages() returns: one average per trial, for
  each run and each parcel.
  """
  run_evs = load_evs(subject, task, cond)
  return selective_averages(
      timeseries_task[subject], run_evs, skip=lag, drop_frames=drop_frames
  )

concat_lr_rl(avgs_arr)

In [16]:
def concat_lr_rl(avgs_arr):
  """Join the per-trial averages of the first two runs for every parcel.

  The values of run 0 come first, followed by those of run 1. Callers in this
  file pass the output of selective_averages(), where run 0 is the RL
  acquisition and run 1 the LR one (so the function name is backwards).

  The parcel count is taken from the input rather than assumed to be 360,
  and the former self-checks through test_concat() are gone: they compared a
  freshly built list with its own parts, and could only fail spuriously
  (NaN never equals itself) or with an IndexError on fewer than 224 parcels.

  Args:
    avgs_arr: Indexable as avgs_arr[run][parcel], each cell an iterable of
      per-trial values; only runs 0 and 1 are used.

  Returns:
    list of lists: One list per parcel holding run 0's values then run 1's.
  """
  first_run, second_run = avgs_arr[0], avgs_arr[1]
  return [
      list(first_vals) + list(second_vals)
      for first_vals, second_vals in zip(first_run, second_run)
  ]

get_model_data(task, subtask_q, subtask_r, subject, parcels=PARCEL_IDX, lag=0, drop_frames=0)

In [17]:
def get_model_data(task, subtask_q, subtask_r, subject, parcels=PARCEL_IDX, lag=0, drop_frames=0):
  """Build a trial-by-parcel feature matrix and labels for two conditions.

  Args:
    task (str) : Name of task
    subtask_q (str) : Condition whose trials are labelled 0
    subtask_r (str) : Condition whose trials are labelled 1
    subject (int) : 0-based subject ID
    parcels (sequence of int) : 0-based parcel indices to keep as features.
      This argument used to be ignored in favour of the global PARCEL_IDX.
    lag (int) : Forwarded to get_region_avgs_for_cond
    drop_frames (number) : Forwarded to get_region_avgs_for_cond

  Returns:
    X (list of 1D arrays) : One feature vector (length len(parcels)) per
      trial; all `subtask_q` trials first, then all `subtask_r` trials
    y (list of int) : 0 for each `subtask_q` trial, 1 for each `subtask_r`
      trial, in the same order as X

  """
  # avg arrays (avg parcel activation for each parcel per trial per subtask)
  avgs_question = get_region_avgs_for_cond(subject, task, subtask_q, lag=lag, drop_frames=drop_frames)
  avgs_response = get_region_avgs_for_cond(subject, task, subtask_r, lag=lag, drop_frames=drop_frames)

  # combine both runs for each average: parcel x trials afterwards
  question_concat = concat_lr_rl(avgs_question)
  response_concat = concat_lr_rl(avgs_response)

  # filter out the parcels we don't want
  question_concat = [question_concat[i] for i in parcels]
  response_concat = [response_concat[i] for i in parcels]

  # put in format of selected parcels per trial (1st dim = trial)
  question_reshaped = np.moveaxis(question_concat, 0, 1)
  response_reshaped = np.moveaxis(response_concat, 0, 1)

  X = list(question_reshaped) + list(response_reshaped)
  y = [0]*question_reshaped.shape[0] + [1]*response_reshaped.shape[0]

  return X, y


## Task Analysis

timeseries_task

In [18]:
# Language-task BOLD data for every subject, indexed by subject ID.
timeseries_task = []

# timeseries_task format: subject, then run, then parcel, then time.
# concat=False keeps the runs as separate arrays; they come in BOLD_NAMES
# order, which lists the RL acquisition before the LR one for this task.
# Expected sizes (per the original author's note): 339, 2, 360, 316
for subject in subjects:
  timeseries_task.append(load_timeseries(subject, "language", concat=False))

In [19]:
#print(np.array(timeseries_task).shape)
#print(timeseries_task[0])
#print()
#print(timeseries_task[0][0][0][0])

### Task Descriptions

- MOTOR: cue, lf, lh, rf, rh, t
- WM:
    0bk_body, 0bk_faces, 0bk_nir, 0bk_placed, 0bk_tools, 
    2bk_body, 2bk_faces, 2bk_nir, 2bk_placed, 2bk_tools,
    0bk_cor, 0bk_err,
    2bk_cor, 2bk_err,
    all_bk_cor, all_bk_err
- EMOTION: feat, neutral
- GAMBLING: loss, loss_event, win, win_event, neut_event
- LANGUAGE:
    cue,
    math, story,
    present_math, present_story,
    question_math, question_story,
    response_math, response_story
- RELATIONAL: error, match, relation
- SOCIAL: mental_resp, mental, other_resp, rnd

### Task Data Retrieval

In [20]:
# Task and condition names used by the analysis cells below. The short names
# stand for question/response x math/story.
task = "language"
qm = "question_math"
rm = "response_math"
qs = "question_story"
rs = "response_story"
# NOTE(review): `shift` is not referenced by any active code visible in this
# notebook -- confirm whether it is still needed.
shift = 6

#avgs_qm = []  #question math
#avgs_rm = []  #response math
#avgs_all = []

#avgs_qm = get_region_avgs_for_cond(subject, task, "question_math")
#avgs_rm = get_region_avgs_for_cond(subject, task, "response_math")
#avgs_qs = get_region_avgs_for_cond(subject, task, "question_story")

#print(np.array(avgs_qm).shape)
#print("question math: ")
#print(avgs_qm)
#print(avgs_qm[0][0])
#print(avgs_qm[1][0])
#avgs_all = np.append(avgs_qm[0], avgs_qm[1])
#print(avgs_all.shape)

#print(np.array(avgs_qs).shape)
#print("question story: ")
#print(avgs_qs)


#for subject in subjects:
#  print(subject)
  # Get the average signal in each region for each condition

  # format: condition, then LR/RL (direction), then dictionary w/ trials
  # 4 trials for story, 9 trials for math
#  evs = [load_evs(subject, task, cond) for cond in conditions]
  
  # conditions, 
 # avgs = [selective_average(timeseries_task[subject], ev) for ev in evs]
  ## BOLD data timeseries: /V\M\----/\/\/
  ## EV data:              -------~~|****|
  ##                              ^ onset
  ##                               **** = duration
  ## skip = 2 = ~~

  
  # add subject average to avg arrays
  #avg_qs = np.append(avg_qs, avgs[0])
  #avg_qm = np.append(avg_qm, avgs[1])
  #avg_rs = np.append(avg_rs, avgs[2])
  #avg_rm = np.append(avg_rm, avgs[3])

#print(len(avgs[2][1]))
#print(evs[0][0]['onset'])
#print(np.array(avgs).shape)

In [21]:


# Collects one mean cross-validation score per subject. It is only appended
# to inside the disabled (string-literal / commented-out) code in this cell.
scores_arr = []

# KEEP THIS: the per-subject loop below is disabled by wrapping it in a
# string literal, not deleted.
'''
for subject in subjects:
  #print(subject)

  X_math, y_math = get_model_data(task, qm, rm, subject)
  X_story, y_story = get_model_data(task, qs, rs, subject)
'''

  #model_math = svm.SVC(kernel='rbf')
  #scores = cross_val_score(model, X, y, cv=10)
  #scores_arr.append(scores.mean())


'''
  subject = 1

  # avg arrays (avg parcel activation for each parcel per trial per subtask)
  avgs_qm = get_region_avgs_for_cond(subject, task, qm)
  avgs_rm = get_region_avgs_for_cond(subject, task, rm)
  #avgs_qs = get_region_avgs_for_cond(subject, task, qs)
  #avgs_rs = get_region_avgs_for_cond(subject, task, rs)

  #print(np.array(qm_concat).shape)
  #print("here")

  #print(len(avgs_qm))
  #print(len(avgs_qm[0][1]))
  #print(len(avgs_qm[1][1]))
  #for i in range(360):
  #print(avgs_qm[0][0])
  #print(avgs_qm[1][0])
  #print()

  # combine directions for each average
  # after this, 360 x num trials total
  qm_concat = concat_lr_rl(avgs_qm)
  rm_concat = concat_lr_rl(avgs_rm)
  #qs_concat = concat_lr_rl(avgs_qs)
  #rs_concat = concat_lr_rl(avgs_rs)

  print(qm_concat[PARCEL_IDX[0]])

  print(np.array(qm_concat).shape)
  print(np.array(rm_concat).shape)
  print()

  #print(qm_concat[0])
  #print(np.array(qm_concat).shape)
  #print()
  #for i in range(len(qm_concat)):
  #  print(len(qm_concat[i]))
  #print()

  #filter out the parcels we don't want
  qm_concat = [qm_concat[i] for i in PARCEL_IDX]
  rm_concat = [rm_concat[i] for i in PARCEL_IDX]
  #qs_concat = [qs_concat[i] for i in PARCEL_IDX]
  #rs_concat = [rs_concat[i] for i in PARCEL_IDX]

  print(qm_concat[0])

  print(np.array(qm_concat).shape)
  print(np.array(rm_concat).shape)
  print()

  # put in format of 360 parcels per trial (1st dim = trial)
  qm_concat = np.moveaxis(qm_concat, 0, 1)
  rm_concat = np.moveaxis(rm_concat, 0, 1)
  #qs_concat = np.moveaxis(qs_concat, 0, 1)
  #rs_concat = np.moveaxis(rs_concat, 0, 1)

  print(np.array(qm_concat).shape)
  print(np.array(rm_concat).shape)
  print()
'''

'''
  X = list(qm_concat) + list(rm_concat)
  y = [0]*qm_concat.shape[0] + [1]*rm_concat.shape[0]

  model = svm.SVC(kernel='rbf')
  scores = cross_val_score(model, X, y, cv=10)
  scores_arr.append(scores.mean())
  

  break
'''
#print(scores)


#avgs_qm = get_region_avgs_for_cond(subject, task, "question_math")
#avgs_rm = get_region_avgs_for_cond(subject, task, "response_math")


'''
avgs_qm_reshaped = np.moveaxis(avgs_qm, 0, 1)
#print(avgs_qm_reshaped.shape)
#print(list(avgs_qm_reshaped[0][0]))
avgs_qm_new = [None]*360

#print(avgs_qm_reshaped[5][0])
#print("+")
#print(avgs_qm_reshaped[1][1])
#print("=")
concat_list = list(avgs_qm_reshaped[300][0]) + list(avgs_qm_reshaped[300][1])
test_concat(list(avgs_qm_reshaped[300][0]), list(avgs_qm_reshaped[300][1]), concat_list)
#print(len(concat_list))

for i in range(360):
    new_values = list(avgs_qm_reshaped[i][0]) + list(avgs_qm_reshaped[i][1])
    avgs_qm_new[i] = new_values
    #print(new_values)

'''
#print(avgs_qm_new)

'\navgs_qm_reshaped = np.moveaxis(avgs_qm, 0, 1)\n#print(avgs_qm_reshaped.shape)\n#print(list(avgs_qm_reshaped[0][0]))\navgs_qm_new = [None]*360\n\n#print(avgs_qm_reshaped[5][0])\n#print("+")\n#print(avgs_qm_reshaped[1][1])\n#print("=")\nconcat_list = list(avgs_qm_reshaped[300][0]) + list(avgs_qm_reshaped[300][1])\ntest_concat(list(avgs_qm_reshaped[300][0]), list(avgs_qm_reshaped[300][1]), concat_list)\n#print(len(concat_list))\n\nfor i in range(360):\n    new_values = list(avgs_qm_reshaped[i][0]) + list(avgs_qm_reshaped[i][1])\n    avgs_qm_new[i] = new_values\n    #print(new_values)\n\n'

## SVM Implementation

In [22]:
# imports
from pycaret.utils import enable_colab 
enable_colab()


Colab mode enabled.


In [23]:
from pycaret.classification import *
import pandas as pd

In [166]:
# test classifier

# Number of frames to skip for hemodynamic lag.
# 7 is ~five seconds (0.72 * 7), which is in the theory range (3-8 seconds)
# and appeared to provide peak accuracy for most subjects; subject 0 had
# better results at 8 or 9, subject 300 at 6 or even 5.
# It is currently set to 0, i.e. no lag is applied in this run.
lag = 0

# also changed the duration (deleting two frames) to capture only peak BOLD
# this could most likely be tuned further by using a value specific to each subtask,
# or a fraction of total duration, but this is good enough for now
drop_frames = 2

subject=250
print("subject: ", subject)
print("# parcels used: ", len(PARCEL_IDX))
X, y = get_model_data(task, qm, rm, subject, lag=lag, drop_frames=drop_frames)

# X: one row per trial, one column per parcel; y: column vector of 0/1 labels
X=np.array(X)
y=np.array(y)
y=y.reshape(-1,1)
print(X.shape)
print(y.shape)

# One column name per feature plus a final one for the label. Deriving the
# count from X (instead of the former hard-coded 24) keeps this correct when
# the number of parcels changes; the label column is always the last one.
col=[str(ele) for ele in range(X.shape[1] + 1)]
print(col)

data=pd.DataFrame(np.hstack((X,y)),columns=col)
data.shape

subject:  250
# parcels used:  23
using lag of  0
using drop_frames of  2
[array([ 9, 10, 11]), array([28, 29, 30, 31]), array([127, 128, 129, 130]), array([148, 149, 150, 151]), array([168, 169, 170]), array([223, 224, 225]), array([287, 288, 289, 290]), array([307, 308, 309, 310])]
[array([50, 51, 52]), array([67, 68, 69]), array([126, 127, 128]), array([144, 145, 146]), array([201, 202, 203, 204]), array([220, 221, 222]), array([238, 239, 240, 241]), array([301, 302, 303])]
using lag of  0
using drop_frames of  2
[array([13, 14, 15]), array([33, 34, 35]), array([132, 133, 134]), array([153, 154, 155]), array([173, 174, 175]), array([228, 229, 230]), array([292, 293, 294]), array([312, 313, 314])]
[array([54, 55, 56]), array([72, 73, 74]), array([131, 132, 133]), array([148, 149, 150]), array([207, 208, 209]), array([225, 226, 227]), array([243, 244, 245]), array([306, 307, 308])]
(32, 23)
(32, 1)
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', 

(32, 24)

In [167]:
# The label is the last column of `data` (named '23' with the current 23
# parcels); looking it up avoids a hard-coded name that breaks when the number
# of features changes.
exp_clf101 = setup(data = data, target = data.columns[-1], session_id=123, use_gpu=True, normalize=True, normalize_method='zscore') 

Unnamed: 0,Description,Value
0,session_id,123
1,Target,23
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(32, 24)"
5,Missing Values,False
6,Numeric Features,23
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [168]:
# Compare the candidate classifiers on the experiment configured by setup();
# the comparison table is shown as this cell's output. The returned object is
# presumably the top-ranked model (the name and the later predict_model
# output suggest so) -- confirm against the pycaret documentation.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.7167,0.0,0.8,0.65,0.7,0.4,0.4,0.012
xgboost,Extreme Gradient Boosting,0.7167,0.65,0.7,0.6,0.6333,0.4,0.4,0.11
gbc,Gradient Boosting Classifier,0.7,0.8,0.6,0.5,0.5333,0.4,0.4,0.067
dt,Decision Tree Classifier,0.6833,0.675,0.75,0.6,0.65,0.35,0.35,0.014
ada,Ada Boost Classifier,0.6667,0.75,0.6,0.55,0.5667,0.32,0.3,0.106
catboost,CatBoost Classifier,0.6667,0.8,0.6,0.5,0.5333,0.3,0.3,1.027
ridge,Ridge Classifier,0.65,0.0,0.7,0.55,0.6,0.3,0.3,0.017
qda,Quadratic Discriminant Analysis,0.6,0.6,0.6,0.5,0.5333,0.22,0.2,0.014
lda,Linear Discriminant Analysis,0.6,0.55,0.65,0.5,0.5333,0.19,0.2,0.014
rf,Random Forest Classifier,0.5833,0.75,0.55,0.45,0.4667,0.14,0.15,0.872


In [169]:
#lr = create_model('lr')
#print(lr)
# Opens pycaret's interactive plot selector for the chosen model (see the
# widget in this cell's output).
evaluate_model(best_model)



interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [170]:
#tuned_lr = tune_model(lr)
#print(tuned_lr)
# Score of best_model on the full X/y built above, which includes the samples
# it was trained on.
# NOTE(review): X here is the raw feature matrix, whereas setup() was called
# with normalize=True -- confirm whether best_model expects normalized inputs
# before trusting this number.
best_model.score(X, y)

0.65625

In [171]:
# Run best_model through pycaret's predict_model; the output below lists the
# metrics and 10 rows with a predicted Label column (presumably the hold-out
# split created by setup() -- confirm).
predict_model(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.4,0.4,0.6,0.4286,0.5,-0.2,-0.2182


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,Label
0,0.407612,-1.024491,-1.534491,-0.164517,-0.245158,-0.932778,-0.821413,-0.123086,0.389981,-1.957168,-0.542336,-1.718593,-0.411533,-1.138551,-1.776783,0.598879,2.06808,0.646789,1.579501,1.621711,0.552102,-1.108623,0.22127,0.0,1.0
1,1.381435,1.39649,-0.217556,0.579932,0.754819,0.599138,0.174535,0.573931,0.083307,2.613533,-0.011945,0.723194,-0.384988,-0.453924,-0.197469,3.211073,0.415166,0.31061,-1.47637,-0.352365,-1.866972,-0.958568,-1.082199,1.0,0.0
2,-1.955678,-0.712676,-0.121994,-0.253515,-0.255986,-0.745227,-1.685146,-0.253336,-0.407055,-2.313488,-1.528552,-0.853669,-0.773163,-1.099319,-0.531334,0.737648,1.290651,-0.128578,-1.385276,-0.421007,-1.301227,-1.681601,-0.343867,0.0,1.0
3,-0.25688,-0.236308,-1.085214,-1.155403,-0.685268,-0.186744,-0.346722,-2.696529,-1.027859,-1.149909,-0.914609,-2.276767,-0.747242,-0.589542,-0.299666,-1.527886,-0.411211,-0.27221,-0.424526,-1.371811,-1.671326,-1.496635,-1.630513,1.0,1.0
4,-0.90888,1.767151,1.248957,1.302075,2.183176,-1.420494,0.204328,1.859171,2.395314,-0.192206,3.357598,0.845271,0.989182,1.272943,1.340647,2.243853,1.861959,1.366816,2.553543,1.552845,2.749784,3.435971,1.949444,0.0,1.0
5,-0.320916,-0.430432,-0.715183,-0.086432,0.573455,0.089888,-0.101565,-0.868513,-0.911058,-0.689469,0.205033,0.687697,-0.547154,0.138274,0.807273,0.137341,1.200858,-1.071412,-0.303823,-0.836066,-1.270678,-0.630963,-0.674827,1.0,0.0
6,0.472045,-0.354169,-1.930118,-0.241469,-1.324555,-0.550842,-1.149227,0.418586,0.605652,0.87404,-0.168565,-0.868127,-0.727034,-1.147189,-0.817682,-0.464916,-0.882881,-1.316188,0.487755,0.766576,-0.445504,-0.070064,-0.993541,0.0,0.0
7,0.072672,-0.762729,0.05494,0.760972,-0.450885,-1.800229,-1.18564,-0.319723,-1.030318,-1.800382,-0.39486,-1.120332,0.057626,-1.118047,-0.596786,0.50868,-0.40787,0.600984,-0.288169,-0.625101,-0.492842,-0.56973,-0.414793,1.0,1.0
8,-1.268145,-0.433137,-1.419518,-0.460353,-1.014612,-0.269956,-1.425758,-1.407039,-0.359803,-0.43727,-0.662987,-1.613572,-1.095525,-0.937628,-0.348467,-0.242886,-0.718877,-1.335049,-1.814052,-2.833118,-1.462091,-0.571399,-1.454107,0.0,1.0
9,0.856901,-0.424535,-0.500531,-0.368958,0.542559,-0.113446,0.383464,0.464168,0.930841,0.156984,-0.21038,-2.397372,0.428362,-0.908808,-0.865482,0.829236,3.698584,1.90093,0.6674,3.115928,2.063271,-0.539947,0.615456,1.0,1.0


In [172]:

# Ordinary least squares of the 0/1 label on all parcel features plus an
# intercept column. The summary below reports 32 observations and 23 model
# degrees of freedom, leaving only 8 residual degrees of freedom, so the fit
# is heavily over-parameterized.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.808
Model:                            OLS   Adj. R-squared:                  0.257
Method:                 Least Squares   F-statistic:                     1.465
Date:                Thu, 22 Jul 2021   Prob (F-statistic):              0.298
Time:                        03:53:25   Log-Likelihood:                 3.1931
No. Observations:                  32   AIC:                             41.61
Df Residuals:                       8   BIC:                             76.79
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7625      0.535     -1.426      0.1

In [29]:
'''
#question_concat = concat_lr_rl(avgs_qm)
#response_concat = concat_lr_rl(avgs_rm)


print(np.array(question_concat).shape)
print(np.array(response_concat).shape)
print()

question_concat = np.moveaxis(question_concat, 0, 1)
response_concat = np.moveaxis(response_concat, 0, 1)

#question_concat = np.reshape(question_concat, (19,360))
#response_concat = np.reshape(response_concat, (19,360))
print(question_concat.shape)
print(response_concat.shape)
print()

# dimensions: first by trials (all of question trials then all response trials)
#             then by parcel values
X = list(question_concat) + list(response_concat) # Design Matrix

#print(X)

#X = np.reshape(X, 2, 1)

print(np.array(X).shape)
#print(np.array(X[0][0]).shape)
#print(np.array(X[1][0]).shape)

# print(np.array(X[0][45]).shape)

# 0 is question
# 1 is response
# dimensions: first question then response, all parcels for both (720)

# label: what each trial is
y = [0]*question_concat.shape[0] + [1]*response_concat.shape[0]

print(len(y))
print(y)
'''

'\n#question_concat = concat_lr_rl(avgs_qm)\n#response_concat = concat_lr_rl(avgs_rm)\n\n\nprint(np.array(question_concat).shape)\nprint(np.array(response_concat).shape)\nprint()\n\nquestion_concat = np.moveaxis(question_concat, 0, 1)\nresponse_concat = np.moveaxis(response_concat, 0, 1)\n\n#question_concat = np.reshape(question_concat, (19,360))\n#response_concat = np.reshape(response_concat, (19,360))\nprint(question_concat.shape)\nprint(response_concat.shape)\nprint()\n\n# dimensions: first by trials (all of question trials then all response trials)\n#             then by parcel values\nX = list(question_concat) + list(response_concat) # Design Matrix\n\n#print(X)\n\n#X = np.reshape(X, 2, 1)\n\nprint(np.array(X).shape)\n#print(np.array(X[0][0]).shape)\n#print(np.array(X[1][0]).shape)\n\n# print(np.array(X[0][45]).shape)\n\n# 0 is question\n# 1 is response\n# dimensions: first question then response, all parcels for both (720)\n\n# label: what each trial is\ny = [0]*question_concat.s

In [30]:
# from sklearn.model_selection import cross_val_score
# clf = svm.SVC(kernel='linear', C=1)
# scores = cross_val_score(clf, X, Y, cv=10)
# print(scores)



# 2-fold cross-validation of an RBF-kernel SVM on the current X and y;
# prints one score per fold.
model = svm.SVC(kernel='rbf')
scores = cross_val_score(model, X, y, cv=2)
print(scores)


'''
model_2 = DecisionTreeClassifier(random_state=0)
scores_2 = cross_val_score(model_2, X, y, cv=2)
print(scores_2)
'''

[0.58823529 0.52941176]


'\nmodel_2 = DecisionTreeClassifier(random_state=0)\nscores_2 = cross_val_score(model_2, X, y, cv=2)\nprint(scores_2)\n'