In [1]:
import pandas as pd
import numpy as np
from math import log2
import os
import glob
from tqdm import tqdm  # Import the progress bar library

In [2]:
# Category merge table: each raw `Collider_CategoricalN` label on the right is
# collapsed into the coarser label on the left before transitions are counted.
mapping = {
    raw_label: merged_label
    for merged_label, raw_labels in (
        ("Task_Building", ("TaskBuilding_Public", "TaskBuilding_Residential")),
        ("Agent_Body", ("Active_Agent", "Passive_Agent")),
        ("Agent_Face", ("Active_Agent_Face", "Passive_Agent_Face")),
    )
    for raw_label in raw_labels
}

# Collider names (matched against the `names` column) that open a trial window.
# Kept in the original order; written as a whitespace-separated table.
collider_list = """
    56_Sa  39_Sa  19_Cma 55_Sa  25_Cma 40_Sa  41_Sa
    17_Cma 47_Sa  03_Cma 13_Cma 24_Cma 01_Cma 54_Sa
    15_Cma 29_Sa  04_Cma 49_Sa  30_Sa  02_Cma 51_Sa
    08_Cma 28_Cma 26_Cma 44_Sa  06_Cma 53_Sa  37_Sa
    32_Sa  20_Cma 16_Cma 50_Sa  34_Sa  11_Cma 38_Sa
    33_Sa  12_Cma 22_Cma 42_Sa  05_Cma 23_Cma 18_Cma
    27_Cma 45_Sa  43_Sa  09_Cma 31_Sa  48_Sa  10_Cma
    52_Sa  07_Cma 46_Sa  35_Sa  36_Sa  21_Cma 14_Cma
""".split()


In [3]:
# Locations: one input folder of per-participant CSVs, three result folders
input_dir = "/Volumes/TwoTeras/1_Experiment_2/Eye_Tracking/Pre_processed/05_Debbies_gaze/"
output_trials_dir = "/Volumes/TwoTeras/1_Experiment_2/Entropy_Results/Window/trials_df/"
output_transition_dir = "/Volumes/TwoTeras/1_Experiment_2/Entropy_Results/Window/transition_matrix/"
output_entropy_dir = "/Volumes/TwoTeras/1_Experiment_2/Entropy_Results/Window/entropy_results/"

# Create each result folder if it is missing (no error when it already exists)
for _result_dir in (output_trials_dir, output_transition_dir, output_entropy_dir):
    os.makedirs(_result_dir, exist_ok=True)

# Collect every CSV found directly inside the input folder
csv_pattern = os.path.join(input_dir, "*.csv")
file_paths = glob.glob(csv_pattern)

# Function to calculate transition entropy
def calculate_transition_entropy(matrix, stationary_distribution):
    """Compute per-row and stationary-weighted Shannon entropy of a transition matrix.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Transition probabilities; each row is iterated with ``iterrows`` and
        its index label is used as the category key.
    stationary_distribution : mapping
        Category label -> weight, read with ``.get(label, 0)`` so missing
        labels contribute nothing to the total.

    Returns
    -------
    tuple
        ``(total_entropy, category_entropies)`` where ``category_entropies``
        maps each row label to ``sum(-p * log2(p))`` over its positive entries
        and ``total_entropy`` is the sum of those values weighted by
        ``stationary_distribution``.
    """
    per_category = {}
    weighted_total = 0
    for label, probabilities in matrix.iterrows():
        entropy_bits = 0
        for prob in probabilities:
            # Zero (and non-positive) entries are skipped: log2 is undefined there
            if prob > 0:
                entropy_bits += -prob * log2(prob)
        per_category[label] = entropy_bits
        weighted_total += entropy_bits * stationary_distribution.get(label, 0)
    return weighted_total, per_category

# Process all CSV files.
#
# For each input file the loop:
#   1. keeps the rows whose `events` value is -2,
#   2. opens a trial window around collider hits (rows whose `names` is in
#      `collider_list`), skipping hits that follow too closely on the previous
#      accepted hit of the same collider,
#   3. builds a row-normalized transition matrix over the mapped
#      `Collider_CategoricalN` labels of every trial,
#   4. writes one CSV of normalized entropies per file to `output_entropy_dir`.

# Timing parameters, in seconds.
# NOTE(review): earlier comments in this cell spoke of a "30-second window",
# but the code has always used the values below -- confirm they are intended.
REFRACTORY_SECONDS = 50                    # same-collider hits within this span of the last accepted hit are skipped
WINDOW_BEFORE = pd.Timedelta(seconds=5)    # trial window starts this long before the hit
WINDOW_AFTER = pd.Timedelta(seconds=55)    # trial window ends this long after the hit

for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # Load the data; `timeStampDataPointEnd` is parsed as seconds since the epoch
    data = pd.read_csv(file_path)
    data['date_seconds'] = pd.to_datetime(data['timeStampDataPointEnd'], unit='s')

    # Filter for the desired gaze events
    data_Reduced = data[data['events'] == -2]

    # Dataset bounds do not change inside the hit loop, so compute them once per file
    first_sample_time = data_Reduced['date_seconds'].min()
    last_sample_time = data_Reduced['date_seconds'].max()

    # Filter and label rows with colliders
    filtered_df = data_Reduced[data_Reduced['names'].isin(collider_list)].copy()
    #filtered_df = data_Reduced[data_Reduced['Collider_CategoricalN'].str.contains('TaskBuilding', na=False)].copy()
    # Per-collider hit counter; not read again in this cell (trial segments get
    # their own `Occurrence_Order` below).
    filtered_df['Occurrence_Order'] = filtered_df.groupby('names').cumcount() + 1

    # Segment data by each occurrence of colliders
    last_processed_time = {}
    trials = []
    for index, row in filtered_df.iterrows():
        collider_name = row['names']
        occurrence_time = row['date_seconds']

        # Check if this occurrence falls within the refractory period of the
        # last accepted hit on the same collider
        if (
            collider_name in last_processed_time
            and (occurrence_time - last_processed_time[collider_name]).total_seconds() <= REFRACTORY_SECONDS
        ):
            # Skip this occurrence since it's too close to the previous accepted one
            continue

        # Update the last processed time for this collider
        last_processed_time[collider_name] = occurrence_time

        # Constrain the trial window to the dataset bounds
        window_start = max(first_sample_time, occurrence_time - WINDOW_BEFORE)
        window_end = min(last_sample_time, occurrence_time + WINDOW_AFTER)

        # Extract the constrained window
        trial_segment = data_Reduced[
            (data_Reduced['date_seconds'] >= window_start) &
            (data_Reduced['date_seconds'] <= window_end)
        ].copy()

        if trial_segment.empty:
            continue

        # Add trial-specific labels; the number is the running count of trials
        # accepted so far in this file (across all colliders)
        trial_segment['Collider_Name'] = collider_name
        trial_segment['Occurrence_Order'] = len(trials) + 1
        trial_segment['Trial_ID'] = f"{collider_name}_Trial_{len(trials) + 1}"
        trials.append(trial_segment)

    # Combine all trials into a single DataFrame
    if trials:
        trials_df = pd.concat(trials, ignore_index=True)
    else:
        continue  # Skip this file if no trials are found

    # Calculate entropy for each trial
    entropy_results = []
    for trial_id, trial_data in trials_df.groupby('Trial_ID'):
        collider_name = trial_data['Collider_Name'].iloc[0]
        occurrence_order = trial_data['Occurrence_Order'].iloc[0]

        # Build transition matrix
        # Apply mapping and extract gaze sequence
        trial_data = trial_data.copy()
        trial_data["Mapped_Column_Collider_Categorical"] = trial_data["Collider_CategoricalN"].replace(mapping)
        gaze_sequence = trial_data['Mapped_Column_Collider_Categorical'].reset_index(drop=True)
        categories = gaze_sequence.unique()
        transition_matrix = pd.DataFrame(0, index=categories, columns=categories, dtype=float)

        # Count every consecutive (current -> next) pair in the sequence
        for current_category, next_category in zip(gaze_sequence.iloc[:-1], gaze_sequence.iloc[1:]):
            transition_matrix.loc[current_category, next_category] += 1

        # Normalize transition matrix; rows without outgoing transitions become all zeros
        transition_matrix = transition_matrix.div(transition_matrix.sum(axis=1), axis=0).fillna(0)

        # Calculate stationary distribution: the left eigenvector of the
        # transition matrix for eigenvalue 1, rescaled to sum to 1. A uniform
        # distribution is the fallback whenever no usable eigenvector exists
        # (e.g. the matrix has an all-zero row and no eigenvalue close to 1).
        stationary_distribution_dict = {category: 1 / len(categories) for category in categories}
        try:
            # Pass a plain ndarray: handing the DataFrame itself to numpy can
            # return DataFrame-wrapped results on some library versions, which
            # the positional indexing below cannot slice.
            eigvals, eigvecs = np.linalg.eig(transition_matrix.T.to_numpy())
        except np.linalg.LinAlgError:
            # Decomposition did not converge; keep the uniform fallback
            unit_eig_idx = np.array([], dtype=int)
        else:
            unit_eig_idx = np.flatnonzero(np.isclose(eigvals, 1))

        if unit_eig_idx.size:
            # If several eigenvalues pass the tolerance, use only the one
            # closest to 1 instead of mixing the entries of several vectors.
            best_idx = unit_eig_idx[np.argmin(np.abs(eigvals[unit_eig_idx] - 1))]
            stationary_distribution = np.real(eigvecs[:, best_idx])
            component_sum = stationary_distribution.sum()
            if np.isfinite(component_sum) and component_sum != 0:
                stationary_distribution = stationary_distribution / component_sum
                stationary_distribution_dict = {
                    categories[i]: stationary_distribution[i] for i in range(len(categories))
                }

        # Calculate entropies
        overall_transition_entropy, transition_entropy_per_category = calculate_transition_entropy(
            transition_matrix, stationary_distribution_dict
        )

        # Stationary entropy per category: -pi * log2(pi), taken as 0 when pi <= 0
        stationary_entropy_per_category = {
            category: (-stationary_distribution_dict[category] * log2(stationary_distribution_dict[category]))
            if stationary_distribution_dict[category] > 0 else 0
            for category in categories
        }

        # Normalize entropies by log2(number of categories); 0 when there is a
        # single category, where that divisor would be 0
        num_categories = len(categories)
        normalized_overall_entropy = overall_transition_entropy / log2(num_categories) if num_categories > 1 else 0
        normalized_transition_entropy_per_category = {
            category: entropy / log2(num_categories) if num_categories > 1 else 0
            for category, entropy in transition_entropy_per_category.items()
        }
        normalized_stationary_entropy_per_category = {
            category: entropy / log2(num_categories) if num_categories > 1 else 0
            for category, entropy in stationary_entropy_per_category.items()
        }

        # Store results for this trial
        result = {
            'Trial_ID': trial_id,
            'Collider_Name': collider_name,
            'Occurrence_Order': occurrence_order,
            'Gaze_Sequence_Length': len(gaze_sequence),
            'Overall_Transition_Entropy': normalized_overall_entropy
        }

        # Add per-category entropies to the result
        for category in categories:
            result[f'Transition_Entropy_{category}'] = normalized_transition_entropy_per_category.get(category, 0)
            result[f'Stationary_Entropy_{category}'] = normalized_stationary_entropy_per_category.get(category, 0)

        entropy_results.append(result)

    # Save entropy results for the file.
    # The participant id is taken as the 6 characters right before the
    # 4-character extension -- assumes file names end in "<6-char id>.csv"; TODO confirm.
    participant_id = file_path[-10:-4]
    entropy_df = pd.DataFrame(entropy_results)
    entropy_df.to_csv(os.path.join(output_entropy_dir, f"{participant_id}_entropy_results.csv"), index=False)

Processing Files: 100%|█████████████████████| 145/145 [06:44<00:00,  2.79s/file]
