In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from notebooks.imports import *

import scipy.io as sio
import h5py

import hdf5storage

### Load Configs

In [2]:
# Project configuration objects provided by the local `config` module.
from config import dir_config, main_config

# Raw and processed data directories, as declared in the directory config.
raw_dir = Path(dir_config.data.raw)
processed_dir = Path(dir_config.data.processed)


### Utility functions

In [3]:
def determine_choice(row):
    """Infer which side was chosen on a single trial.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the keys 'is_valid', 'outcome' and 'target'.

    Returns
    -------
    str or float
        ``row['target']`` when the trial is valid and the outcome is truthy;
        the opposite side when the trial is valid and the outcome is falsy
        ('left' if the target is 'right', otherwise 'right');
        ``np.nan`` when the trial is not valid.
    """
    if not row['is_valid']:
        # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
        return np.nan

    if row['outcome']:
        return row['target']

    # Incorrect response: the subject picked the side opposite to the target.
    return 'left' if row['target'] == 'right' else 'right'

def get_prior_condition(df, threshold=60):
    """Find the target/color pairing that dominates the valid trials of a session.

    Within each color, the share of valid trials belonging to each target is
    computed as a percentage. The first (target, color) pair whose share is
    strictly greater than ``threshold`` is reported; pairs are considered in
    groupby order (sorted by target, then by color).

    Parameters
    ----------
    df : pandas.DataFrame
        Trial table with 'is_valid', 'target' and 'color' columns.
    threshold : float, default 60
        Percentage (0-100) that a pairing's share must exceed.

    Returns
    -------
    list
        ``[condition, target, color]`` where ``condition`` is 'gr', 'gl', 'rr'
        or 'rl', or ``['eq', -1, -1]`` when no pairing exceeds the threshold.
    """
    # Two-letter labels keyed by (target, color); any other pairing falls back to 'rl'.
    condition_labels = {
        ('right', 'green'): 'gr',
        ('left', 'green'): 'gl',
        ('right', 'red'): 'rr',
    }

    valid_df = df[df['is_valid']]

    # Trial counts per (target, color), expressed as a percentage of that color's valid trials.
    condition_counts = valid_df.groupby(['target', 'color']).size().reset_index(name='counts')
    total_counts = condition_counts.groupby('color')['counts'].transform('sum')
    condition_counts['percentage'] = (condition_counts['counts'] / total_counts) * 100

    conditions_met = condition_counts[condition_counts['percentage'] > threshold]
    if conditions_met.empty:
        return ['eq', -1, -1]

    first_match = conditions_met.iloc[0]
    target, color = first_match['target'], first_match['color']
    return [condition_labels.get((target, color), 'rl'), target, color]

### Raw Data Column Description
#### Codes
    - 1001              Start trial
    - 2500              Fixation point ON
    - 2000              Target appears (white choice cue for the correct choice)
    - 2009              Distractor appears (white choice cue for the wrong choice)
    - 4000:4001     Target (correct choice) is left (4000) or right (4001)
    - 4100:4199     Difficulty levels or coherence levels (4100= easiest)
    - 5000              Glass pattern appears
    - 5500              Glass pattern disappears
    - 5001              (invalid trial) Failed to hold fixation
    - 5004              (invalid trial) Failed to hold target
    - 5005              (invalid trial) Anticipatory saccade
    - 5006              Chose distractor (wrong choice)
    - 5007              Failed to respond on time
    - 5510              Correct choice
    - 1503              The actual key press
    - 6101:6102     Glass pattern color (6101: green; 6102: red)
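##### Events (column numbers are 1-based)
    - 3rd column: GP orientation, i.e. target side {4000: left; 4001: right}
    - 4th column: % coherence {4100: 100; 4101: 35; 4102: 13; 4103: 0}
    - 5th column: GP color {6101: green; 6102: red}
    - 9th column: trial outcome code (5510 = correct choice)
##### Time (column numbers are 1-based)
    - Reaction time: 8th column minus 7th column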

## Compiling data from all subjects

In [4]:
# Subject IDs to exclude, read from the 'moca_rejection' entry of the main config
# (presumably subjects rejected on MoCA screening — TODO confirm).
reject_subject = main_config['moca_rejection']

In [5]:
# Subject IDs listed in the session metadata table.
subject_metadata = pd.read_csv(raw_dir / 'session_metadata.csv')
subjects_from_metadata = subject_metadata['subject_id'].unique()

# Collect all .mat files from the raw data directory. The listing is sorted because
# glob yields entries in a filesystem-dependent order; sorting keeps the session order
# (and therefore the row order of anything built from it) reproducible across machines.
raw_mat_files = sorted(raw_dir.glob("*.mat"))

# The subject ID is the part of the file name before the first underscore.
subjects_from_data_file = [f.stem.split("_")[0] for f in raw_mat_files]

In [6]:
# Drop rejected subjects from both subject lists (metadata-derived and file-name-derived).
# Note: this filters only the two lists of IDs; raw_mat_files itself is left untouched.
subjects_from_metadata = [s for s in subjects_from_metadata if s not in reject_subject]
subjects_from_data_file = [s for s in subjects_from_data_file if s not in reject_subject]

In [7]:
# Look for differences between the subjects named in the metadata and those that have data files.
metadata_subjects = set(subjects_from_metadata)
data_file_subjects = set(subjects_from_data_file)

print(f"Metadata contains extra: {metadata_subjects - data_file_subjects}")
print(f"Data files contain extra: {data_file_subjects - metadata_subjects}")
print(len(data_file_subjects), len(metadata_subjects))

# Compare membership rather than size: two different sets of subjects can have the
# same length, which a length-only check would silently accept.
assert data_file_subjects == metadata_subjects, "Mismatch between metadata and data files"

Metadata contains extra: set()
Data files contain extra: set()
40 40


In [8]:
# Lookup tables translating raw event codes into readable values.
# Series.map leaves codes that are absent from a table as real NaN. The previous
# np.select(..., default=np.NaN) relied on an alias removed in NumPy 2.0 and, for the
# string-valued columns, turned missing values into the literal string 'nan'.
COLOR_CODES = {6101: 'green', 6102: 'red'}
COHERENCE_CODES = {4100: 100, 4101: 35, 4102: 13, 4103: 0}
TARGET_CODES = {4000: 'left', 4001: 'right'}

aggregate_df_list = []  # one DataFrame per session file, concatenated once after the loop

# NOTE(review): this iterates over every file in raw_mat_files, including files from
# subjects listed in reject_subject — confirm that the exclusion is applied downstream.
for session_file in raw_mat_files:
    session_data = hdf5storage.loadmat(str(session_file))
    event_codes = session_data['event']
    event_times = session_data['time']

    df = pd.DataFrame({
        'color': pd.Series(event_codes[:, 4]).map(COLOR_CODES),
        # Cast to float so the column dtype does not depend on whether a session
        # happens to contain an unmapped code.
        'coherence': pd.Series(event_codes[:, 3]).map(COHERENCE_CODES).astype(float),
        'target': pd.Series(event_codes[:, 2]).map(TARGET_CODES),
    })

    # A trial is invalid when the 9th event column holds 5007 or the 8th event column
    # holds 5005, 0 or 5008 (0 and 5008 are not in the code table — TODO confirm meaning).
    invalid_mask = (
        (event_codes[:, 8] == 5007)
        | (event_codes[:, 7] == 5005)
        | (event_codes[:, 7] == 0)
        | (event_codes[:, 7] == 5008)
    )
    invalid_trials = np.flatnonzero(invalid_mask)
    df['is_valid'] = ~invalid_mask

    # Code 5510 in the 9th event column marks a correct choice.
    df['outcome'] = event_codes[:, 8] == 5510

    df['choice'] = df.apply(determine_choice, axis=1)
    df['reaction_time'] = event_times[:, 7] - event_times[:, 6]

    # Session-level prior condition, broadcast to every trial of the session.
    df['prior'], df['prior_direction'], df['prior_color'] = get_prior_condition(df)

    # Session identifiers parsed from the file name: the subject ID is the first
    # underscore-separated token; the medication label is the second-to-last token with
    # its last four characters removed, lower-cased (depends on the file naming scheme —
    # TODO confirm).
    name_parts = session_file.name.split('_')
    df['subject_id'] = name_parts[0]
    df['medication'] = name_parts[-2][:-4].lower()
    df['session_name'] = session_file.name

    aggregate_df_list.append(df)

# Concatenate all sessions at once.
aggregate_df = pd.concat(aggregate_df_list, ignore_index=True)

# Replace empty strings with NaN.
aggregate_df = aggregate_df.replace('', np.nan)


In [9]:
# Reorder the columns: session identifiers first, then condition labels,
# then the per-trial measurements, with the source file name last.
column_order = [
    'subject_id',
    'medication',
    'prior',
    'prior_direction',
    'prior_color',
    'color',
    'coherence',
    'target',
    'is_valid',
    'outcome',
    'choice',
    'reaction_time',
    "session_name",
]
aggregate_df = aggregate_df[column_order]

In [10]:
# Write the aggregated trial table to 'aggregate_data.csv' without the index column.
# NOTE(review): the file is written into raw_dir although processed_dir is defined
# above and otherwise unused here — confirm this location is intended.
aggregate_df.to_csv(Path(raw_dir, 'aggregate_data.csv'), index=False)