In [1]:
import pandas as pd
import numpy as np
from math import log2
import os
import glob
from tqdm import tqdm  # Import the progress bar library

In [6]:
# Define paths
# NOTE(review): these are absolute paths on an external volume; the cell only
# runs on a machine where that volume is mounted.
input_dir = "/Volumes/TwoTeras/1_Experiment_2/Eye_Tracking/Pre_processed/05_Debbies_gaze/"
output_trials_dir = "/Volumes/TwoTeras/1_Experiment_2/Entropy_Results/Window/trials_df/"
output_transition_dir = "/Volumes/TwoTeras/1_Experiment_2/Entropy_Results/Window/transition_matrix/"
output_entropy_dir = "/Volumes/TwoTeras/1_Experiment_2/Entropy_Results/Window/entropy_results/"

# Ensure output directories exist (no error if they are already there)
for output_dir in (output_trials_dir, output_transition_dir, output_entropy_dir):
    os.makedirs(output_dir, exist_ok=True)

# Collider list: values of the 'names' column that open an analysis window
collider_list = [
    '56_Sa', '39_Sa', '19_Cma', '55_Sa', '25_Cma', '40_Sa', '41_Sa',
    '17_Cma', '47_Sa', '03_Cma', '13_Cma', '24_Cma', '01_Cma', '54_Sa',
    '15_Cma', '29_Sa', '04_Cma', '49_Sa', '30_Sa', '02_Cma', '51_Sa',
    '08_Cma', '28_Cma', '26_Cma', '44_Sa', '06_Cma', '53_Sa', '37_Sa',
    '32_Sa', '20_Cma', '16_Cma', '50_Sa', '34_Sa', '11_Cma', '38_Sa',
    '33_Sa', '12_Cma', '22_Cma', '42_Sa', '05_Cma', '23_Cma', '18_Cma',
    '27_Cma', '45_Sa', '43_Sa', '09_Cma', '31_Sa', '48_Sa', '10_Cma',
    '52_Sa', '07_Cma', '46_Sa', '35_Sa', '36_Sa', '21_Cma', '14_Cma'
]

# Get list of files to process.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to make the processing order identical from run to run.
file_paths = sorted(glob.glob(os.path.join(input_dir, "*.csv")))

# Process all CSV files with a progress bar.
#
# For every input file this cell:
#   1. keeps only the rows whose 'events' value is -2,
#   2. opens a window of WINDOW_SECONDS at each collider occurrence (occurrences
#      of the same collider that fall inside its previous window are skipped),
#   3. saves the concatenated windows ("trials") to output_trials_dir,
#   4. builds a row-normalised transition matrix of 'Collider_CategoricalN' per
#      trial and saves it to output_transition_dir,
#   5. saves one normalised transition-entropy value per trial to
#      output_entropy_dir.

# Length, in seconds, of the window opened at each collider occurrence. The same
# value is used as the refractory period for repeated hits on that collider.
WINDOW_SECONDS = 30


def calculate_transition_entropy(matrix, stationary_distribution):
    """Return the stationary-weighted sum of the row entropies of `matrix`.

    Parameters
    ----------
    matrix : pd.DataFrame
        Row-normalised transition matrix; the index holds the state labels.
    stationary_distribution : dict
        Maps a state label to its weight. Labels missing from the dict
        contribute with weight 0.

    Returns
    -------
    float
        sum_i w_i * H(row_i), with H the Shannon entropy in bits. Zero
        probabilities are skipped (0 * log2(0) is treated as 0).
    """
    total_entropy = 0
    for state, row in matrix.iterrows():
        row_entropy = sum(-p * log2(p) for p in row if p > 0)
        total_entropy += row_entropy * stationary_distribution.get(state, 0)
    return total_entropy


def _stationary_distribution(transition_matrix, categories):
    """Return {category: weight} for the stationary distribution of the chain.

    The distribution is taken from a left eigenvector of `transition_matrix`
    whose eigenvalue is (numerically) 1. A uniform distribution is returned
    whenever no usable eigenvector exists: the decomposition fails, no
    eigenvalue is close to 1 (e.g. a state without outgoing transitions), the
    eigenvector sums to zero, or it has mixed signs / non-finite entries.
    If several eigenvalues are close to 1 the stationary distribution is not
    unique; the first matching eigenvector is used.
    """
    uniform = {category: 1 / len(categories) for category in categories}

    try:
        eigvals, eigvecs = np.linalg.eig(transition_matrix.to_numpy().T)
    except np.linalg.LinAlgError:
        return uniform

    unit_positions = np.flatnonzero(np.isclose(eigvals, 1))
    if unit_positions.size == 0:
        return uniform

    # Take exactly one eigenvector; flattening several of them would
    # interleave their components and misalign them with `categories`.
    vector = np.real(eigvecs[:, unit_positions[0]])
    total = vector.sum()
    if np.isclose(total, 0):
        return uniform

    vector = vector / total
    if not np.all(np.isfinite(vector)) or np.any(vector < -1e-9):
        return uniform

    return dict(zip(categories, vector))


for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # Load the data
    data = pd.read_csv(file_path)
    data['date_seconds'] = pd.to_datetime(data['timeStampDataPointEnd'], unit='s')

    # Filter for the desired gaze events
    data_Reduced = data[data['events'] == -2]

    # Filter and label rows with colliders
    filtered_df = data_Reduced[data_Reduced['names'].isin(collider_list)].copy()
    # Per-collider running count; not consumed further down in this cell.
    filtered_df['Occurrence_Order'] = filtered_df.groupby('names').cumcount() + 1

    # Dataset bounds do not change while scanning occurrences, so compute once.
    dataset_start = data_Reduced['date_seconds'].min()
    dataset_end = data_Reduced['date_seconds'].max()
    window_length = pd.Timedelta(seconds=WINDOW_SECONDS)

    # Maintain a dictionary to track the last processed time for each collider
    last_processed_time = {}

    # Segment data by each occurrence of colliders
    trials = []
    for collider_name, occurrence_time in zip(filtered_df['names'], filtered_df['date_seconds']):
        # Skip this occurrence if it falls within the collider's active window
        previous_time = last_processed_time.get(collider_name)
        if (
            previous_time is not None
            and (occurrence_time - previous_time).total_seconds() <= WINDOW_SECONDS
        ):
            continue

        # Update the last processed time for this collider
        last_processed_time[collider_name] = occurrence_time

        # Constrain the window to the dataset bounds
        window_start = max(dataset_start, occurrence_time)
        window_end = min(dataset_end, occurrence_time + window_length)

        # Extract the constrained window (all retained gaze rows, not only colliders)
        in_window = (
            (data_Reduced['date_seconds'] >= window_start) &
            (data_Reduced['date_seconds'] <= window_end)
        )
        trial_segment = data_Reduced[in_window].copy()

        if trial_segment.empty:
            continue

        # Add trial-specific labels; the trial number counts trials across all
        # colliders within this file, so every Trial_ID is unique per file.
        trial_number = len(trials) + 1
        trial_segment['Collider_Name'] = collider_name
        trial_segment['Occurrence_Order'] = trial_number
        trial_segment['Trial_ID'] = f"{collider_name}_Trial_{trial_number}"
        trials.append(trial_segment)

    # Skip this file if no trials are found
    if not trials:
        continue

    # Combine all trials into a single DataFrame
    trials_df = pd.concat(trials, ignore_index=True)

    # Save trials_df
    # NOTE(review): takes the 6 characters before the 4-character extension as
    # the participant ID -- confirm that every input file name ends that way.
    participant_id = file_path[-10:-4]
    trials_df.to_csv(os.path.join(output_trials_dir, f"{participant_id}_trials_df.csv"), index=False)

    # Calculate transition matrices and entropy
    entropy_results = []
    for trial_id, trial_data in trials_df.groupby('Trial_ID'):
        collider_name = trial_data['Collider_Name'].iloc[0]
        occurrence_order = trial_data['Occurrence_Order'].iloc[0]

        gaze_sequence = trial_data['Collider_CategoricalN'].reset_index(drop=True)
        categories = gaze_sequence.unique()
        transition_matrix = pd.DataFrame(0, index=categories, columns=categories, dtype=float)

        # Build the transition matrix by counting consecutive pairs
        for current_category, next_category in zip(gaze_sequence.iloc[:-1], gaze_sequence.iloc[1:]):
            transition_matrix.loc[current_category, next_category] += 1

        # Normalize each row to probabilities; rows without outgoing
        # transitions divide by zero and are reset to all zeros.
        transition_matrix = transition_matrix.div(transition_matrix.sum(axis=1), axis=0).fillna(0)

        # Save transition matrix. The trial ID is part of the file name so that
        # each trial keeps its own matrix instead of overwriting the previous one.
        transition_matrix.to_csv(
            os.path.join(output_transition_dir, f"{participant_id}_{trial_id}_transition_matrix.csv")
        )

        # Calculate stationary distribution (uniform fallback when undefined)
        stationary_distribution_dict = _stationary_distribution(transition_matrix, categories)

        # Calculate entropy
        overall_transition_entropy = calculate_transition_entropy(transition_matrix, stationary_distribution_dict)

        # Normalize by the maximum possible entropy for this number of states
        num_categories = len(transition_matrix)
        normalized_overall_entropy = overall_transition_entropy / log2(num_categories) if num_categories > 1 else 0

        entropy_results.append({
            'Trial_ID': trial_id,
            'Collider_Name': collider_name,
            'Occurrence_Order': occurrence_order,
            'Overall_Transition_Entropy': normalized_overall_entropy
        })

    # Save entropy results
    entropy_df = pd.DataFrame(entropy_results)
    entropy_df.to_csv(os.path.join(output_entropy_dir, f"{participant_id}_entropy_results.csv"), index=False)


Processing Files: 100%|█████████████████████| 145/145 [05:33<00:00,  2.30s/file]
