In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from notebooks.imports import *

import scipy.io as sio
import h5py

import hdf5storage

### Load Configs

In [2]:
from config import dir_config, main_config

# Root directories for raw and processed data, read from the project's directory config.
# NOTE(review): `Path` is presumably provided by the star import in the first cell — confirm.
raw_dir = Path(dir_config.data.raw)
processed_dir = Path(dir_config.data.processed)


### Utility functions

In [3]:
def determine_choice(row):
    """Infer which side the subject chose on a single trial.

    Args:
        row: Mapping-like trial record (e.g. a DataFrame row) providing the
            keys ``'is_valid'``, ``'outcome'`` and ``'target'``.

    Returns:
        ``row['target']`` when the outcome is truthy (correct trial), the
        opposite side when the outcome is falsy (error trial), and ``np.nan``
        when the trial is invalid or its outcome is missing.
    """
    if not row['is_valid']:
        return np.nan

    outcome = row['outcome']
    # NaN is the only value that is not equal to itself. A missing outcome must
    # not be read as "correct": NaN is truthy, so a plain `if outcome:` would
    # report the target as the choice.
    if outcome is None or outcome != outcome:
        return np.nan

    if outcome:
        return row['target']
    return 'left' if row['target'] == 'right' else 'right'

def get_prior_condition(df):
    """Identify the target/color pairing that dominates the valid trials.

    Valid trials are counted per (target, color) pair and each count is
    expressed as a percentage of all valid trials sharing that color. The
    first pair (in groupby order) whose share exceeds 60% is reported.

    Args:
        df: Trial table with 'is_valid', 'target' and 'color' columns.

    Returns:
        ``[condition, target, color]`` where condition is 'gr', 'gl', 'rr' or
        'rl', or ``['eq', -1, -1]`` when no pair exceeds 60%.
    """
    valid_trials = df[df['is_valid']].copy()

    # Number of valid trials for every (target, color) pair
    pair_counts = (
        valid_trials
        .groupby(['target', 'color'])
        .size()
        .reset_index(name='counts')
    )

    # Share of each pair among the valid trials of the same color, in percent
    per_color_total = pair_counts.groupby('color')['counts'].transform('sum')
    pair_counts['percentage'] = (pair_counts['counts'] / per_color_total) * 100

    biased_pairs = pair_counts[pair_counts['percentage'] > 60]
    if biased_pairs.empty:
        return ['eq', -1, -1]

    # Only the first qualifying pair is reported
    top = biased_pairs.iloc[0]
    labels = {
        ('right', 'green'): 'gr',
        ('left', 'green'): 'gl',
        ('right', 'red'): 'rr',
    }
    # Every other combination is labelled 'rl'
    condition = labels.get((top['target'], top['color']), 'rl')
    return [condition, top['target'], top['color']]

### Raw Data Column Description
#### Codes
    - 1001              Start trial
    - 2500              Fixation point ON
    - 2000              Target appears (white choice cue for the correct choice)
    - 2009              Distractor appears (white choice cue for the wrong choice)
    - 4000:4001     Target (correct choice) is left (4000) or right (4001)
    - 4100:4199     Difficulty levels or coherence levels (4100= easiest)
    - 5000              Glass pattern appears
    - 5500              Glass pattern disappears
    - 5001              (invalid trial)	Failed to hold fixation
    - 5004              (invalid trial)	Failed to hold target
    - 5005              (invalid trial)	Anticipatory saccade
    - 5006              Chose distractor (wrong choice)
    - 5007              Failed to respond on time
    - 5510              Correct choice
    - 1503              The actual key press
    - 6101:6102     Glass pattern color (6101: green, 6102: red)
##### Events (column numbers are 1-indexed)
    - 3rd column: GP orientation 
    - 4th column: % coherence {4100:100; 4101:35; 4102:13; 4103:0}
    - 5th column: GP color
    - 9th column: correct? 
##### Time (column numbers are 1-indexed)
    - Reaction Time: 8th column - 7th column

## Compiling data from all subjects

In [4]:
# Directory that holds one sub-folder per recording session
fragment_data_path = raw_dir.joinpath('harvard_data', 'pd')
assert fragment_data_path.exists(), f"Fragment data path {fragment_data_path} does not exist."

# Keep only the directories found directly under the data path
folders = list(filter(lambda entry: entry.is_dir(), fragment_data_path.iterdir()))
folders

[PosixPath('/mnt/pd-data/raw/harvard_data/pd/P8_on'),
 PosixPath('/mnt/pd-data/raw/harvard_data/pd/P8_off'),
 PosixPath('/mnt/pd-data/raw/harvard_data/pd/P5_off'),
 PosixPath('/mnt/pd-data/raw/harvard_data/pd/P2_off'),
 PosixPath('/mnt/pd-data/raw/harvard_data/pd/P5_on'),
 PosixPath('/mnt/pd-data/raw/harvard_data/pd/P2_on')]

In [5]:
import numpy as np
import pandas as pd
from pathlib import Path
import hdf5storage

def process_sessions(folders, raw_dir, should_save=False):
	"""Build a per-trial table from the ``trial/*.mat`` files of each session folder.

	For every folder, all ``.mat`` files under ``<folder>/trial`` are loaded with
	``hdf5storage.loadmat`` and their ``event`` / ``time`` arrays are concatenated
	(axis 0) in sorted filename order. Event codes are then decoded into a
	DataFrame with the columns ``color``, ``coherence``, ``target``,
	``is_valid``, ``outcome``, ``choice`` and ``reaction_time``.

	Args:
		folders: Iterable of ``pathlib.Path`` session folders.
		raw_dir: Root of the raw-data tree. Concatenated sessions are written to
			``raw_dir / 'harvard_data' / 'pd'`` when ``should_save`` is True.
		should_save: When True, every folder is processed and its concatenated
			arrays are saved as a v7.3 ``.mat`` file named
			``H<id>_<STATE>meds_<prior>.mat``. When False (default), the function
			returns as soon as the first folder containing ``.mat`` files has
			been decoded.

	Returns:
		The DataFrame of the first folder with data when ``should_save`` is
		False; ``None`` when ``should_save`` is True or no folder has data.
	"""
	# Event-code lookups. Codes without an entry become NaN (a real missing
	# value rather than the string 'nan').
	color_by_code = {6101: 'green', 6102: 'red'}
	coherence_by_code = {4100: 100, 4101: 35, 4102: 13, 4103: 0}
	target_by_code = {4000: 'left', 4001: 'right'}

	for folder in folders:
		trial_path = folder / 'trial'
		files = sorted(trial_path.glob('*.mat'))

		if not files:
			print(f"No .mat files found in {trial_path}")
			continue

		# Load every file once, then stack the per-file arrays row-wise
		loaded = [hdf5storage.loadmat(file.as_posix()) for file in files]
		session_data = {
			"event": np.concatenate([mat['event'] for mat in loaded]),
			"time": np.concatenate([mat['time'] for mat in loaded]),
		}
		event = session_data['event']
		timestamps = session_data['time']

		# Decode the event codes (0-based columns 4, 3 and 2)
		df = pd.DataFrame({
			'color': pd.Series(event[:, 4]).map(color_by_code),
			# Cast to float so the column dtype is the same with and without unknown codes
			'coherence': pd.Series(event[:, 3]).map(coherence_by_code).astype(float),
			'target': pd.Series(event[:, 2]).map(target_by_code),
		})

		# A trial is invalid when column 8 holds 5007 or column 7 holds 5005, 0 or 5008
		invalid = (event[:, 8] == 5007) | np.isin(event[:, 7], [5005, 0, 5008])
		df['is_valid'] = ~invalid

		# Outcome: 1 for code 5510 in column 8, 0 for code 5006 in column 7, NaN otherwise
		df['outcome'] = np.nan
		df.loc[event[:, 8] == 5510, 'outcome'] = 1
		df.loc[event[:, 7] == 5006, 'outcome'] = 0

		df['choice'] = df.apply(determine_choice, axis=1)

		# Reaction time is the difference between time columns 7 and 6
		df['reaction_time'] = timestamps[:, 7] - timestamps[:, 6]

		if not should_save:
			# Preview mode: hand back the first session that has data
			return df

		parts = folder.name.split('_')
		if len(parts) < 2:
			print(f"Folder name {folder.name} does not follow expected format.")
			continue

		# The inferred prior condition is only needed for the output file name
		prior, _, _ = get_prior_condition(df)
		savename = f"H{parts[0][1:].capitalize()}_{parts[1].upper()}meds_{prior}"

		save_path = raw_dir / 'harvard_data' / 'pd' / f"{savename}.mat"
		save_path.parent.mkdir(parents=True, exist_ok=True)

		hdf5storage.savemat(save_path.as_posix(), session_data, format='7.3')
		print(f"Saved: {save_path.name}")

	return None


In [14]:
# Re-collect the session folders under the PD data directory
fragment_data_path = raw_dir.joinpath('harvard_data', 'pd')
assert fragment_data_path.exists(), f"Fragment data path {fragment_data_path} does not exist."

folders = [entry for entry in fragment_data_path.iterdir() if entry.is_dir()]

# With the default should_save=False the call returns a DataFrame, shown as the cell output
process_sessions(folders, raw_dir)

Unnamed: 0,color,coherence,target,is_valid,outcome,choice,reaction_time
0,red,0.0,right,False,,,1670.214164
1,red,100.0,left,True,1.0,left,992.601647
2,green,100.0,left,True,1.0,left,866.981356
3,red,35.0,left,True,1.0,left,970.51648
4,green,35.0,left,True,0.0,right,1349.344004
5,green,35.0,left,True,1.0,left,918.273553
6,green,100.0,left,True,1.0,left,922.652001
7,red,35.0,left,True,1.0,left,1002.155455
8,red,0.0,left,True,1.0,left,971.325036
9,green,13.0,left,True,0.0,right,1112.180283


In [91]:
import numpy as np
import pandas as pd
from pathlib import Path
import hdf5storage

def process_practice_sessions(folders, raw_dir, should_save=False, session_type='practice', min_trials=6):
	"""Decode the ``.mat`` files of one session sub-folder into a per-trial table.

	Folders are visited in order. For each one, the ``.mat`` files under
	``<folder>/<session_type>`` are loaded with ``hdf5storage.loadmat``; files
	whose ``event`` array has fewer than ``min_trials`` rows are ignored and the
	rest are concatenated (axis 0) in sorted filename order. The first folder
	that yields any data is decoded and returned.

	Args:
		folders: Iterable of ``pathlib.Path`` session folders.
		raw_dir: Unused; kept so existing callers keep working.
		should_save: Unused; kept so existing callers keep working.
		session_type: Name of the sub-folder to read (default ``'practice'``).
		min_trials: Minimum number of rows a file's ``event`` array needs to be
			included (default 6, i.e. more than 5 rows).

	Returns:
		A DataFrame with the columns ``color``, ``coherence``, ``target``,
		``is_valid``, ``outcome``, ``choice`` and ``reaction_time`` for the first
		folder with usable data, or ``None`` if no folder has any.
	"""
	# Event-code lookups. Codes without an entry become NaN (a real missing
	# value rather than the string 'nan').
	color_by_code = {6101: 'green', 6102: 'red'}
	coherence_by_code = {4100: 100, 4101: 35, 4102: 13, 4103: 0}
	target_by_code = {4000: 'left', 4001: 'right'}

	for folder in folders:
		trial_path = folder / session_type
		files = sorted(trial_path.glob('*.mat'))

		if not files:
			continue

		# Keep only the files that contain enough rows
		loaded = [hdf5storage.loadmat(file.as_posix()) for file in files]
		usable = [mat for mat in loaded if mat['event'].shape[0] >= min_trials]
		if not usable:
			# np.concatenate raises on an empty list, so skip this folder
			continue

		event = np.concatenate([mat['event'] for mat in usable])
		timestamps = np.concatenate([mat['time'] for mat in usable])

		# Decode the event codes (0-based columns 4, 3 and 2)
		df = pd.DataFrame({
			'color': pd.Series(event[:, 4]).map(color_by_code),
			# Cast to float so the column dtype is the same with and without unknown codes
			'coherence': pd.Series(event[:, 3]).map(coherence_by_code).astype(float),
			'target': pd.Series(event[:, 2]).map(target_by_code),
		})

		# A trial is invalid when column 8 holds 5007 or column 7 holds 5005, 0 or 5008
		invalid = (event[:, 8] == 5007) | np.isin(event[:, 7], [5005, 0, 5008])
		df['is_valid'] = ~invalid

		# Outcome: 1 for code 5510 in column 8, 0 for code 5006 in column 7, NaN otherwise
		df['outcome'] = np.nan
		df.loc[event[:, 8] == 5510, 'outcome'] = 1
		df.loc[event[:, 7] == 5006, 'outcome'] = 0

		df['choice'] = df.apply(determine_choice, axis=1)

		# Reaction time is the difference between time columns 7 and 6
		df['reaction_time'] = timestamps[:, 7] - timestamps[:, 6]

		return df

	return None

In [185]:
# Directory with one sub-folder per session
fragment_data_path = raw_dir.joinpath('harvard_data', 'pd')
# fragment_data_path = raw_dir / 'harvard_data' / 'hc'
assert fragment_data_path.exists(), f"Fragment data path {fragment_data_path} does not exist."

# Sub-folder of each session to summarise
session_type = 'trial'

folders = [entry for entry in fragment_data_path.iterdir() if entry.is_dir()]

for idx, folder in enumerate(sorted(folders)):
	df = process_practice_sessions([folder], raw_dir, session_type=session_type, should_save=False)
	if df is None:
		# Nothing was loaded for this folder
		continue

	# Sum of outcomes over all trials (valid or not), in percent
	overall_accuracy = df['outcome'].sum() / len(df) * 100
	# Mean outcome over valid trials; computed but not part of the printed summary
	valid_accuracy = df.loc[df.is_valid, 'outcome'].mean() * 100
	# Mean outcome over valid trials at 100% coherence
	accu_on_100 = df.loc[df.is_valid & (df['coherence'] == 100), 'outcome'].mean() * 100

	print(f"Subject: {folder.name}, Valid Trials: {df.is_valid.sum()} / {len(df)}, 100% coherence Accuracy Rate: {accu_on_100:.2f}%,  Overall Accuracy: {overall_accuracy:.2f}%")


Subject: P25_off, Valid Trials: 161 / 824, 100% coherence Accuracy Rate: 63.51%,  Overall Accuracy: 10.68%
Subject: P25_on, Valid Trials: 242 / 724, 100% coherence Accuracy Rate: 54.55%,  Overall Accuracy: 16.44%
Subject: P26_off, Valid Trials: 605 / 733, 100% coherence Accuracy Rate: 49.02%,  Overall Accuracy: 44.20%
Subject: P26_on, Valid Trials: 626 / 651, 100% coherence Accuracy Rate: 53.90%,  Overall Accuracy: 53.92%
Subject: P27_off, Valid Trials: 621 / 646, 100% coherence Accuracy Rate: 33.12%,  Overall Accuracy: 42.72%
Subject: P27_on, Valid Trials: 623 / 643, 100% coherence Accuracy Rate: 56.05%,  Overall Accuracy: 50.08%


In [129]:
from scipy.io import loadmat
def extract_creation_date(mat_header):
    """Return the text that follows 'Created on:' in a .mat file header.

    Args:
        mat_header: Header as ``bytes`` (decoded as UTF-8) or as ``str``.

    Returns:
        The stripped text found after the marker, or the string
        "Creation date not found." when the marker is absent.
    """
    marker = 'Created on:'
    text = mat_header if not isinstance(mat_header, bytes) else mat_header.decode('utf-8')
    if marker not in text:
        return "Creation date not found."
    return text.split(marker)[1].strip()

def load_mat_file(file_path):
	"""Load a .mat file and return the creation date recorded in its header.

	Any exception raised while loading or parsing is printed, and ``None`` is
	returned instead.
	"""
	try:
		mat_contents = loadmat(file_path, squeeze_me=True, struct_as_record=False)
		header = mat_contents['__header__']
		return extract_creation_date(header)
	except Exception as err:
		print(f"Error loading {file_path}: {err}")
		return None


Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/practice: 250318_095525.mat, Creation Date: Mon Apr 28 08:48:23 2025
Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/practice: 250318_095103.mat, Creation Date: Tue Mar 18 09:55:51 2025
Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/practice: 250318_095304.mat, Creation Date: Fri Feb 28 09:31:21 2025
Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/trial: 250318_095718.mat, Creation Date: Tue Mar 18 10:06:36 2025
Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/trial: 250318_101912.mat, Creation Date: Tue Mar 18 10:29:43 2025
Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/trial: 250318_103057.mat, Creation Date: Tue Mar 18 10:40:11 2025
Metadata for /mnt/pd-data/raw/harvard_data/pd/P2_off/trial: 250318_100850.mat, Creation Date: Tue Mar 18 10:18:15 2025


In [146]:
import datetime
from scipy.io import loadmat  # Make sure this import is available

def extract_length_from_mat(filepath):
	"""Return the length of the first sized variable stored in a MATLAB .mat file.

	Keys starting with '__' (loader metadata) are skipped, as are variables that
	do not support ``len()``. Returns "N/A" when no variable qualifies.
	"""
	contents = loadmat(filepath)
	variable_names = (name for name in contents if not name.startswith("__"))
	for name in variable_names:
		try:
			return len(contents[name])
		except TypeError:
			# Unsized entry; try the next variable
			pass
	return "N/A"

def get_filename_timestamp(filename):
	"""Parse the text before the first '.' of `filename` as a ``%y%m%d_%H%M%S`` timestamp."""
	return datetime.datetime.strptime(filename.split(".")[0], "%y%m%d_%H%M%S")


def get_time_difference(filename_time, creation_time_str):
	"""Return the header creation time (parsed from its string form) minus `filename_time`."""
	creation_time = datetime.datetime.strptime(creation_time_str, "%a %b %d %H:%M:%S %Y")
	return creation_time - filename_time


# For each session folder, tabulate every .mat file with its length, the creation
# date stored in its header, and how far that date is from the filename timestamp.
for folder_name in ['P8_on', 'P8_off', 'P2_on', 'P2_off', 'P5_on', 'P5_off']:
	base_dir = raw_dir / 'harvard_data' / 'pd' / folder_name
	# Order by filename; files sharing a name in different sub-folders fall back to path order
	files = sorted(base_dir.rglob('*.mat'), key=lambda path: (path.name, path))

	print("-" * 120)
	print(f"Processing folder: {folder_name}")
	print("-" * 120)
	print(f"{'Folder':<25} {'Length':<8} {'Filename':<30} {'Creation Date':<25} {'Time Difference'}")
	print("-" * 120)

	for file in files:
		creation_date = load_mat_file(file.as_posix())
		if creation_date is None:
			# load_mat_file already printed the error; there is nothing to tabulate
			continue

		filename_time = get_filename_timestamp(file.name)
		try:
			time_diff = get_time_difference(filename_time, creation_date)
		except ValueError:
			# The header did not contain a parsable creation date
			time_diff = "N/A"
		data_length = extract_length_from_mat(file.as_posix())

		print(f"{str(file.parent.name):<25} {str(data_length):<8} {file.name:<30} {creation_date:<25} {time_diff}")


------------------------------------------------------------------------------------------------------------------------
Processing folder: P8_on
------------------------------------------------------------------------------------------------------------------------
Folder                    Length   Filename                       Creation Date             Time Difference
------------------------------------------------------------------------------------------------------------------------
practice                  52       250314_121809.mat              Mon Apr 28 10:11:20 2025  44 days, 21:53:11
practice                  53       250314_121934.mat              Fri Mar 14 12:22:27 2025  0:02:53
practice                  54       250314_122015.mat              Mon Apr 28 10:15:21 2025  44 days, 21:55:06
practice                  53       250314_122212.mat              Fri Mar 14 12:22:27 2025  0:00:15
trial                     161      250314_122343.mat              Fri Mar 14 12:30:4

In [157]:
import datetime
from scipy.io import loadmat

def extract_length_from_mat(filepath):
	"""Return ``len()`` of the first non-metadata variable in a .mat file.

	Entries whose key starts with '__' are ignored and variables without a
	length are skipped. The string "N/A" is returned when nothing qualifies.
	"""
	mat_vars = loadmat(filepath)
	for var_name, value in mat_vars.items():
		if var_name.startswith("__"):
			continue
		try:
			return len(value)
		except TypeError:
			# No length available for this variable; move on
			pass
	return "N/A"

def get_filename_timestamp(filename):
	"""Parse the text before the first '.' of `filename` as a ``%y%m%d_%H%M%S`` timestamp."""
	stem, _, _ = filename.partition(".")
	return datetime.datetime.strptime(stem, "%y%m%d_%H%M%S")

# Get yesterday's date.
# NOTE: the report depends on the day the cell is run, so its output changes between runs.
yesterday = datetime.date.today() - datetime.timedelta(days=1)

# Collect every .mat file whose header creation date is yesterday
entries = []
base_dir = raw_dir / 'harvard_data' / 'pd'
files = list(base_dir.rglob('*.mat'))

for file in files:
	creation_str = load_mat_file(file.as_posix())  # e.g., "Tue Apr 23 10:06:36 2025"
	if creation_str is None:
		# load_mat_file already printed the error for this file
		continue

	try:
		creation_dt = datetime.datetime.strptime(creation_str, "%a %b %d %H:%M:%S %Y")
	except ValueError:
		# The header did not contain a parsable creation date
		print(f"Skipping {file}: no parsable creation date in header")
		continue

	if creation_dt.date() == yesterday:
		data_length = extract_length_from_mat(file.as_posix())
		entries.append((
			file.parent.parent.name,  # Parent folder (e.g., P8_off)
			file.parent.name,         # Subfolder (e.g., trial)
			data_length,
			file.name,
			creation_str,
			creation_dt
		))

# Sort by creation datetime (last tuple element)
entries.sort(key=lambda entry: entry[5])

# Print header
print("-" * 130)
print(f"{'Parent Folder':<20} {'Subfolder':<20} {'Length':<8} {'Filename':<30} {'Creation Date':<25}")
print("-" * 130)

# Print sorted entries
for parent, subfolder, length, filename, creation_str, _ in entries:
	print(f"{parent:<20} {subfolder:<20} {str(length):<8} {filename:<30} {creation_str:<25}")


----------------------------------------------------------------------------------------------------------------------------------
Parent Folder        Subfolder            Length   Filename                       Creation Date            
----------------------------------------------------------------------------------------------------------------------------------
P5_off               practice             50       250401_065823.mat              Mon Apr 28 08:48:23 2025 
P2_off               practice             50       250318_095525.mat              Mon Apr 28 08:48:23 2025 
P8_on                practice             52       250314_121809.mat              Mon Apr 28 10:11:20 2025 
P8_on                practice             54       250314_122015.mat              Mon Apr 28 10:15:21 2025 
P5_off               practice             51       250401_070015.mat              Mon Apr 28 10:24:01 2025 
P5_on                practice             55       250415_063733.mat              Mon Apr 

In [162]:
# NOTE: load_mat_file returns the creation-date string stored in each file's
# header, so the checks below tell whether two files share a creation date —
# they do not compare the recorded data.
pd_dir = raw_dir / 'harvard_data' / 'pd'  # named pd_dir to avoid shadowing the builtin `dir`

created_a = load_mat_file(pd_dir / 'P8_on' / 'practice' / '250314_121934.mat')
created_b = load_mat_file(pd_dir / 'P8_on' / 'practice' / '250314_122212.mat')

# A failed load yields None; two failures must not be reported as a match
same_creation = created_a is not None and created_a == created_b
print(f"P8_on practice 250314_121934 and 250314_122212 are same: {same_creation}")


created_a = load_mat_file(pd_dir / 'P5_off' / 'practice' / '250401_065823.mat')
created_b = load_mat_file(pd_dir / 'P2_off' / 'practice' / '250318_095525.mat')

same_creation = created_a is not None and created_a == created_b
print(f"P5_off practice 250401_065823 and p2_off 250318_095525 are same: {same_creation}")

P8_on practice 250314_121934 and 250314_122212 are same: True
P5_off practice 250401_065823 and p2_off 250318_095525 are same: True


50 trials in 13 secs P5_on
Yesterday's date in p5_on