In [None]:
import os
import numpy as np
import pandas as pd
from fooof import FOOOFGroup
from fooof.analysis import get_band_peak_fg
import mne

In [None]:
## Directory paths
# Input locations: the session-info CSV folder, the epoch .fif folder and the
# folder holding the saved FOOOF result .json files.
csv_load_path, epoch_load_path, fooof_load_path = (
    '00_Subjects_Info',
    '02_Epoch_EEG/Concatenated',
    '03_FOOOF_EEG/Concatenated',
)

# Output locations: the raw feature table, its de-duplicated version and the
# version merged with the behaviour / CBD-level table.
csv_save_path, cleaned_csv_path, merged_csv_path = (
    '04_Features_EEG/Range_13_10_Sec_Epoch_Fixed.csv',
    '04_Features_EEG/Range_13_10_Sec_Epoch_Fixed_Clean.csv',
    '04_Features_EEG/Range_13_10_Sec_Epoch_Fixed_Merged.csv',
)

# Folder name for model-fit plots (not referenced by the cells visible here).
model_fit_plot_save_folder = '05_Plots/FOOOF_Fit'

In [None]:
# Subjects to include in the analysis. IDs are kept as strings because they
# are matched against filename prefixes.
EEG_INFO = dict(
    SUBJ_NUMS=[
        '10002', '10003', '10004', '10008', '10011', '10012', '10016', '10019',
        '10021', '10022', '10023', '10024', '10028', '10031', '10032', '10034',
        '10036', '10037', '10039', '10040', '10046', '10048', '10056', '10058',
    ],
)
SUBJ_NUMS = EEG_INFO['SUBJ_NUMS']

# Frequency bands of interest as (low, high) bounds in Hz
BANDS = dict(
    Delta=(1, 4),
    Theta=(4, 8),
    Alpha=(8, 13),
)

# EEG channel names; a channel's position in this list is the index used by
# the channel groups below.
CHANNELS = [
    'F7', 'Fp1', 'Fp2', 'F8', 'F3', 'Fz', 'F4', 'C3', 'Cz', 'P8',
    'P7', 'Pz', 'P4', 'T3', 'P3', 'O1', 'O2', 'C4', 'T4',
]

# Channel groupings by anatomical region, expressed as positions in CHANNELS
All_Channel_indices = np.arange(len(CHANNELS))
Occipital_Channel_indices = [CHANNELS.index(name) for name in ('O1', 'O2')]
Frontal_Channel_indices = [CHANNELS.index(name) for name in ('F7', 'F3', 'Fz', 'F4', 'F8')]
Central_Channel_indices = [CHANNELS.index(name) for name in ('Cz', 'C3', 'C4')]

# Group name -> channel indices; the group name becomes a column-name prefix
Channel_Groups = dict(
    All=All_Channel_indices,
    Occipital=Occipital_Channel_indices,
    Frontal=Frontal_Channel_indices,
    Central=Central_Channel_indices,
)

In [None]:
## Get FOOOF filenames
all_fooof_files = os.listdir(fooof_load_path)

# Keep only the entries whose name ends with '_fooof_results_fixed.json'
filtered_files = []
for fooof_name in all_fooof_files:
    if fooof_name.endswith('_fooof_results_fixed.json'):
        filtered_files.append(fooof_name)

## Load .csv containing session information
# The computed features are later written into this frame, matched on its
# 'Record ID' and 'Event Name' columns.
info_fname = 'Subjects_Info.csv'
full_path = os.path.join(csv_load_path, info_fname)
session_info_data = pd.read_csv(full_path)

In [None]:
# Extract per-session FOOOF features and write them into `session_info_data`.
#
# For every subject/session that has a FOOOF result file this cell computes:
#   * Duration_Secs            - span of `epoch.times`
#   * All_R2 / All_Error       - mean fit quality over channels passing the R2 threshold
#   * <Group>_Offset/_Exponent - mean aperiodic parameters per channel group
#   * <Group>_<Band>_SNR       - mean over channels of the maximum
#                                aperiodic-corrected power inside the band
#   * <Group>_<Band>_CF/PW/BW  - mean band-peak parameters per channel group
#   * <Channel>_Exponent       - per-channel exponent (NaN where R2 < threshold)
# A failure in one session is reported and skipped so the rest still run.

# Suffix shared by every FOOOF result file handled here
FOOOF_SUFFIX = '_fooof_results_fixed.json'
# Minimum model-fit R2 for a channel to contribute to any averaged metric
R2_THRESHOLD = 0.95

# Loop through each subject
for subj_num in SUBJ_NUMS:
    # Match on "<subject>_" (not the bare number) so the session name can be
    # sliced out safely; a bare prefix match followed by split(...)[1] raises
    # IndexError on any file that lacks the underscore.
    subj_prefix = subj_num + '_'
    subj_files = [
        filename for filename in filtered_files
        if filename.startswith(subj_prefix) and filename.endswith(FOOOF_SUFFIX)
    ]

    # Session name = whatever sits between the subject prefix and the suffix
    session_info_list = [
        filename[len(subj_prefix):-len(FOOOF_SUFFIX)] for filename in subj_files
    ]

    # Process each session for the current subject
    for sess_num in session_info_list:
        try:
            full_sess_name = f'{subj_num}_{sess_num}'
            params_grouped = {}  # column name -> computed value for this session

            # Load epoch .fif file (header only; the data itself is not needed)
            epoch = mne.read_epochs(
                os.path.join(epoch_load_path, full_sess_name + '_epo.fif'),
                preload=False,
            )

            # NOTE(review): `epoch.times` looks like the within-epoch time axis,
            # so this is presumably the length of one epoch rather than of the
            # whole recording session - confirm this is the intended quantity.
            start_time = epoch.times[0]
            end_time = epoch.times[-1]
            duration_seconds = end_time - start_time
            print("Duration of the epoch:", duration_seconds, "seconds")
            params_grouped['Duration_Secs'] = duration_seconds

            # Load FOOOF model results
            json_filename = os.path.join(fooof_load_path, full_sess_name + FOOOF_SUFFIX)
            print(json_filename)
            fg = FOOOFGroup()
            fg.load(json_filename)

            # Frequency axis and power spectra stored with the model
            freqs = fg.freqs
            psd = fg.power_spectra

            # Aperiodic parameters and goodness-of-fit, one value per channel
            # (assumes the channel order in `fg` matches CHANNELS - TODO confirm)
            offset = fg.get_params('aperiodic_params', 'offset')
            exponent = fg.get_params('aperiodic_params', 'exponent')
            R2 = fg.get_params('r_squared')
            Error = fg.get_params('error')

            # Channels with NaN R2 fall into neither set
            valid_indices = np.where(R2 >= R2_THRESHOLD)[0]
            invalid_indices = np.where(R2 < R2_THRESHOLD)[0]
            valid_lookup = set(valid_indices.tolist())

            # Fit quality averaged over the channels that passed the threshold
            All_R2 = np.nanmean(R2[valid_indices])
            All_Error = np.nanmean(Error[valid_indices])
            params_grouped['All_R2'] = All_R2
            params_grouped['All_Error'] = All_Error

            # The aperiodic-corrected spectrum depends only on the channel, so
            # build it once per valid channel instead of regenerating the
            # single-channel model for every (group, band) combination.
            # Both terms are scaled by 10 (presumably log10 power -> dB; confirm).
            psd_corr_by_chan = {
                chan_idx: 10 * psd[chan_idx] - 10 * fg.get_fooof(chan_idx)._ap_fit
                for chan_idx in valid_indices.tolist()
            }

            # Band masks and band-peak tables depend only on the band, not on
            # the channel group, so compute them once per band.
            band_masks = {}
            band_peaks = {}
            for band_name, freq_range in BANDS.items():
                band_masks[band_name] = np.where(
                    (freqs >= freq_range[0]) & (freqs <= freq_range[1])
                )[0]
                band_peaks[band_name] = get_band_peak_fg(fg, freq_range)

            # Process each channel group
            for group_name, channel_indices in Channel_Groups.items():
                valid_channel_indices = [
                    int(idx) for idx in channel_indices if int(idx) in valid_lookup
                ]

                # Group-level aperiodic parameters (NaN when no channel is valid)
                group_offset = np.nanmean([offset[idx] for idx in valid_channel_indices])
                group_exponent = np.nanmean([exponent[idx] for idx in valid_channel_indices])
                params_grouped[f"{group_name}_Offset"] = group_offset
                params_grouped[f"{group_name}_Exponent"] = group_exponent

                # Metrics for each frequency band
                for band_name in BANDS:
                    # Aperiodic-corrected SNR: per-channel maximum inside the
                    # band, then averaged across the group's valid channels
                    idx_band = band_masks[band_name]
                    SNR = np.array([
                        np.max(psd_corr_by_chan[idx][idx_band])
                        for idx in valid_channel_indices
                    ], dtype=float)
                    params_grouped[f"{group_name}_{band_name}_SNR"] = np.nanmean(SNR)

                    # Band-peak columns are read as 0 = CF, 1 = PW, 2 = BW
                    periodic_feats = band_peaks[band_name]
                    cf = np.nanmean(periodic_feats[valid_channel_indices, 0])
                    pw = np.nanmean(periodic_feats[valid_channel_indices, 1])
                    bw = np.nanmean(periodic_feats[valid_channel_indices, 2])
                    params_grouped[f"{group_name}_{band_name}_CF"] = cf
                    params_grouped[f"{group_name}_{band_name}_PW"] = pw
                    params_grouped[f"{group_name}_{band_name}_BW"] = bw

            # Store individual channel exponents, blanking poorly fitted channels
            exponent[invalid_indices] = np.nan
            for i, channel in enumerate(CHANNELS):
                params_grouped[f"{channel}_Exponent"] = exponent[i]

            # Convert computed parameters to a one-row DataFrame
            df_params_grouped = pd.DataFrame([params_grouped])
            column_names = list(df_params_grouped.columns.values)

            # Cast the identifiers to the dtypes of the metadata columns so the
            # equality comparison below can match
            subj_num_convert = session_info_data['Record ID'].dtype.type(subj_num)
            sess_num_convert = session_info_data['Event Name'].dtype.type(sess_num)
            row_mask = ((session_info_data['Record ID'] == subj_num_convert) &
                        (session_info_data['Event Name'] == sess_num_convert))

            # Be explicit when the session has no metadata row instead of
            # attempting an assignment to an empty selection
            if not row_mask.any():
                print(f"No session info row for subject {subj_num}, session {sess_num}; "
                      "features not stored")
                continue

            # Select by index *label*; feeding labels back into `.index[...]`
            # positionally is only correct for a default RangeIndex
            matching_rows = session_info_data.index[row_mask.to_numpy()]
            session_info_data.loc[matching_rows, column_names] = df_params_grouped.values

        except Exception as e:
            # Deliberate best-effort: report the failure and move on
            print(f"Error processing subject {subj_num}, session {sess_num}: {e}")
            continue

# Save the updated DataFrame back to the CSV file
session_info_data.to_csv(csv_save_path, index=False, mode='w+', header=True)

In [None]:
## Clean dataframe to include only the sessions with highest occipital alpha SNR when there are retries due to child not cooperating

# Columns identifying one subject/timepoint, and the column used to rank retries
dedup_keys = ['Record ID', 'Timepoint']
snr_column = 'Occipital_Alpha_SNR'

# Work on a copy so the in-memory session table stays untouched
df_copy = session_info_data.copy()

# Rows without an occipital alpha SNR carry no EEG features, so drop them
df = df_copy.dropna(subset=[snr_column])

# Order by subject and timepoint (ascending), with the highest SNR first
# inside each subject/timepoint pair
df_sorted = df.sort_values(
    by=dedup_keys + [snr_column],
    ascending=[True, True, False],
)

# The first row of each subject/timepoint pair is now its highest-SNR session
df_highest_snr_kept = df_sorted.drop_duplicates(subset=dedup_keys, keep='first')

# Sanity check: boolean Series, True wherever a subject/timepoint pair still repeats
remaining_duplicates_check = df_highest_snr_kept.duplicated(subset=dedup_keys, keep=False)

# Write the cleaned table without the row index
df_highest_snr_kept.to_csv(cleaned_csv_path, index=False, mode='w+')

# Summary of the cleaning step: output location and whether duplicates remain
result = dict(
    cleaned_csv_path=cleaned_csv_path,
    duplicates_remaining=remaining_duplicates_check.any(),
)

In [None]:
# Merge EEG data with Behavior and CBD level information

# Load the CSV file containing behavior and CBD level data.
# The folder comes from `csv_load_path` (as for the session-info CSV) so that
# changing the configured input folder is honoured here as well.
cbd_levels_fname = 'Subjects_Info_Behavior_CBD_Levels.csv'
cbd_levels_df = pd.read_csv(os.path.join(csv_load_path, cbd_levels_fname))

# Load the cleaned EEG session data (one row per 'Record ID' + 'Timepoint')
session_info_df = pd.read_csv(cleaned_csv_path)

# Columns to bring over from the CBD and behavior table. The names must match
# that file's header exactly, so they are left as spelled there.
columns_to_concat = [
    'Time Difference (eegassessment-cbdlevel)', 'CBD', 'OHCBD', 'COOHCBD', 'AEA',
    'rbs_stereotyped_behavior_subscale', 'rbs_self_injurious_behavior_subscale',
    'rbs_complusive_behavior_subscale', 'rbs_ritualistic_behavior_subscale',
    'rbs_sameness_behavior_subscale', 'rbs_restricted_behavior_subscale',
    'rbs_total_score', 'ppvt_raw_score', 'ppvt_standard_score',
    'toni4_raw_score', 'toni4_index_score', 'eowpvt4_raw_score',
    'eowpvt4_standard_score', 'beery_vmi_raw_score', 'beery_vmi_standard_score',
    'beery_vp_raw_score', 'beery_vp_standard_score', 'beery_mc_raw_score',
    'beery_mc_standard_score'
]

# Inner join: keep only rows whose 'Record ID' and 'Timepoint' occur in both tables
merge_keys = ['Record ID', 'Timepoint']
merged_df = session_info_df.merge(
    cbd_levels_df[merge_keys + columns_to_concat],
    on=merge_keys,
    how='inner'
)

# A key that repeats in the CBD table yields several merged rows per session;
# keep the last one. Assigning the result (rather than inplace=True) keeps the
# cell idempotent on re-run.
merged_df = merged_df.drop_duplicates(
    ['Record ID', 'Event Name', 'Randomization', 'Timepoint'],
    keep='last'
)

# Save the merged dataframe to a new CSV file
merged_df.to_csv(merged_csv_path, index=False)