In [1]:
import pandas as pd
import numpy as np
from math import log2
import os
import glob
from tqdm import tqdm  # Import the progress bar library

In [2]:
# Relabelling table: both task-building variants collapse to "Task_Building"
# and every agent variant collapses to "Agent".
mapping = {
    **dict.fromkeys(
        ("TaskBuilding_Public", "TaskBuilding_Residential"),
        "Task_Building",
    ),
    **dict.fromkeys(
        ("Active_Agent", "Passive_Agent", "Active_Agent_Face", "Passive_Agent_Face"),
        "Agent",
    ),
}

# Collider names used to select rows via `names.isin(collider_list)`.
collider_list = """
    56_Sa 39_Sa 19_Cma 55_Sa 25_Cma 40_Sa 41_Sa
    17_Cma 47_Sa 03_Cma 13_Cma 24_Cma 01_Cma 54_Sa
    15_Cma 29_Sa 04_Cma 49_Sa 30_Sa 02_Cma 51_Sa
    08_Cma 28_Cma 26_Cma 44_Sa 06_Cma 53_Sa 37_Sa
    32_Sa 20_Cma 16_Cma 50_Sa 34_Sa 11_Cma 38_Sa
    33_Sa 12_Cma 22_Cma 42_Sa 05_Cma 23_Cma 18_Cma
    27_Cma 45_Sa 43_Sa 09_Cma 31_Sa 48_Sa 10_Cma
    52_Sa 07_Cma 46_Sa 35_Sa 36_Sa 21_Cma 14_Cma
""".split()


In [3]:
# Define paths: one input folder of per-participant CSV files and three
# output folders (trial segments, transition matrices, entropy tables).
input_dir = "/Volumes/TwoTeras/0_Experiment_1/Eye_Tracking/Pre_processed/05_Debbies_gaze/"
output_trials_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/trials_df/"
output_transition_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/transition_matrix/"
output_entropy_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/entropy_results/"

# Ensure output directories exist (no error if they are already there)
os.makedirs(output_trials_dir, exist_ok=True)
os.makedirs(output_transition_dir, exist_ok=True)
os.makedirs(output_entropy_dir, exist_ok=True)


# Get list of files to process. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the processing order
# (and anything left over from the last iteration) reproducible.
file_paths = sorted(glob.glob(os.path.join(input_dir, "*.csv")))

In [4]:
# Function to calculate Chao-Shen entropy
def chao_shen(q):
    """Estimate entropy (in bits) from category counts with the Chao-Shen correction.

    Parameters
    ----------
    q : array-like of counts
        One count per category. Zero (and negative) entries are dropped.
        Plain lists/tuples are accepted as well as NumPy arrays.

    Returns
    -------
    H : float
        Chao-Shen entropy estimate; 0.0 when no positive counts remain.
    pa : array
        Coverage-adjusted probabilities of the observed categories.
    la : array
        For each observed category, 1 - (1 - pa) ** n, i.e. the probability
        of seeing it at least once in n draws.
    """
    # Lists/tuples do not support boolean-mask indexing; convert them first.
    # Objects that already carry a dtype (ndarray, Series) are used as-is.
    if not hasattr(q, "dtype"):
        q = np.asarray(q)

    yx = q[q > 0]  # Remove bins with zero counts
    if len(yx) == 0:
        # Nothing observed: the formulas below would divide by n == 0.
        return 0.0, np.array([], dtype=float), np.array([], dtype=float)

    n = np.sum(yx)  # Total count
    p = yx.astype(float) / n  # Observed probabilities
    f1 = np.sum(yx == 1)  # Number of singletons in the sample

    if f1 == n:  # All singletons would give coverage 0 and 0/0 terms below
        f1 -= 1

    C = 1 - (f1 / n)  # Estimated sample coverage
    pa = C * p  # Coverage-adjusted probabilities
    la = 1 - (1 - pa) ** n  # Probability of observing each category
    H = -np.sum((pa * np.log2(pa)) / la)  # Chao-Shen entropy

    return H, pa, la

# Chao-Shen-corrected transition entropy over a matrix of raw counts
def calculate_chao_shen_transition_entropy(raw_matrix, stationary_distribution):
    """
    Apply the Chao-Shen estimator to each row of a raw transition-count matrix.

    A row is only evaluated when its counts sum to more than 1 and at least
    two of its cells are non-zero; other rows get no entry and add nothing
    to the total.

    Returns a tuple ``(total, per_row)`` where ``per_row`` maps the row label
    to its entropy and ``total`` is the sum of those entropies, each weighted
    by ``stationary_distribution.get(label, 0)``.
    """
    per_row = {}
    weighted_sum = 0

    for label, transitions in raw_matrix.iterrows():
        counts = transitions.values.astype(int)  # integer counts for the estimator
        zero_cells = len(transitions[transitions == 0.0])

        # Too few observations, or at most one non-zero destination: skip the row
        if counts.sum() <= 1 or zero_cells >= len(transitions) - 1:
            continue

        row_entropy, _, _ = chao_shen(counts)
        per_row[label] = row_entropy
        weighted_sum += row_entropy * stationary_distribution.get(label, 0)

    return weighted_sum, per_row


# Function to calculate transition entropy
def calculate_transition_entropy(matrix, stationary_distribution):
    """Compute per-row Shannon entropies (bits) and their weighted total.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Row-normalised transition matrix; each row is iterated as a set of
        probabilities and only entries > 0 contribute.
    stationary_distribution : dict
        Weight for each row label; labels that are missing weigh 0.

    Returns
    -------
    (total_entropy, category_entropies)
        ``category_entropies`` maps row label -> row entropy, and
        ``total_entropy`` is the sum of row entropies times their weights.
    """
    # Start the accumulator at zero. Seeding it with NaN made every total NaN,
    # and the `np.NaN` alias no longer exists in NumPy 2.0.
    total_entropy = 0.0
    category_entropies = {}
    for i, row in matrix.iterrows():
        row_entropy = sum(-p * log2(p) for p in row if p > 0)
        category_entropies[i] = row_entropy
        total_entropy += row_entropy * stationary_distribution.get(i, 0)
    return total_entropy, category_entropies


In [5]:
# Process all CSV files: cut each recording into collider-triggered trials,
# save the trials, and compute Chao-Shen-corrected transition entropy per trial.
for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # Participant ID is the file name up to its first dot
    participant_id = os.path.basename(file_path).split('.')[0]

    # Load the data; `timeStampDataPointEnd` is parsed as seconds since the epoch
    data = pd.read_csv(file_path)
    data['date_seconds'] = pd.to_datetime(data['timeStampDataPointEnd'], unit='s')

    # Filter for the desired gaze events (rows whose `events` value is -2)
    data_Reduced = data[data['events'] == -2]

    # Filter and label rows with colliders
    filtered_df = data_Reduced[data_Reduced['names'].isin(collider_list)].copy()
    filtered_df['Occurrence_Order'] = filtered_df.groupby('names').cumcount() + 1

    # The recording bounds do not change inside the trial loop: compute them once
    recording_start = data_Reduced['date_seconds'].min()
    recording_end = data_Reduced['date_seconds'].max()

    # Segment data by each occurrence of colliders
    last_processed_time = {}
    trials = []
    for index, row in filtered_df.iterrows():
        collider_name = row['names']
        occurrence_time = row['date_seconds']

        # Skip a hit if the same collider already opened a trial <= 30 s earlier
        if (
            collider_name in last_processed_time
            and (occurrence_time - last_processed_time[collider_name]).total_seconds() <= 30
        ):
            continue

        last_processed_time[collider_name] = occurrence_time

        # 30-second window starting at the hit, clipped to the recording bounds
        window_start = max(recording_start, occurrence_time)
        window_end = min(recording_end, occurrence_time + pd.Timedelta(seconds=30))

        trial_segment = data_Reduced[
            (data_Reduced['date_seconds'] >= window_start) &
            (data_Reduced['date_seconds'] <= window_end)
        ].copy()

        # The running trial count doubles as order label and makes Trial_ID unique
        trial_number = len(trials) + 1
        trial_segment['Collider_Name'] = collider_name
        trial_segment['Occurrence_Order'] = trial_number
        trial_segment['Trial_ID'] = f"{collider_name}_Trial_{trial_number}"
        trials.append(trial_segment)

    if trials:
        trials_df = pd.concat(trials, ignore_index=True)
        trials_file_name = os.path.basename(file_path).replace('.csv', '_trials.csv')
        trials_df.to_csv(os.path.join(output_trials_dir, trials_file_name), index=False)
    else:
        print(f"No trials found for file: {file_path}")
        continue

    # Initialize a list to store entropy results for this participant
    chao_shen_entropy_results = []

    # Calculate entropy for each trial
    for trial_id, trial_data in trials_df.groupby('Trial_ID'):
        collider_name = trial_data['Collider_Name'].iloc[0]
        occurrence_order = trial_data['Occurrence_Order'].iloc[0]

        # Work on a copy: adding a column to a groupby slice triggers
        # SettingWithCopyWarning and may not behave as intended.
        trial_data = trial_data.copy()
        trial_data["Mapped_Column_Collider_Categorical"] = trial_data["Collider_CategoricalN"].replace(mapping)
        gaze_sequence = trial_data['Mapped_Column_Collider_Categorical'].reset_index(drop=True)

        categories = gaze_sequence.unique()
        raw_transition_matrix = pd.DataFrame(0, index=categories, columns=categories, dtype=float)

        # Build raw transition matrix: count each consecutive (current, next) pair
        for i in range(len(gaze_sequence) - 1):
            current_category = gaze_sequence.iloc[i]
            next_category = gaze_sequence.iloc[i + 1]
            raw_transition_matrix.loc[current_category, next_category] += 1

        # Normalize transition matrix rows (rows without outgoing transitions become 0)
        transition_matrix = raw_transition_matrix.div(raw_transition_matrix.sum(axis=1), axis=0).fillna(0)

        # Stationary distribution = left eigenvector of the transition matrix
        # for eigenvalue 1, rescaled to sum to 1.
        try:
            eigvals, eigvecs = np.linalg.eig(transition_matrix.T)
            unit_positions = np.flatnonzero(np.isclose(eigvals, 1))
            # Flattening several matching eigenvectors would interleave their
            # entries and assign wrong weights, so require exactly one match.
            if len(unit_positions) != 1:
                raise ValueError(
                    f"expected exactly one eigenvalue equal to 1, found {len(unit_positions)}"
                )
            stationary_distribution = np.real(eigvecs[:, unit_positions[0]])
            eigvec_total = stationary_distribution.sum()
            if not np.isfinite(eigvec_total) or np.isclose(eigvec_total, 0):
                raise ValueError("eigenvector for eigenvalue 1 cannot be normalized")
            stationary_distribution = stationary_distribution / eigvec_total

            stationary_distribution_dict = {categories[i]: stationary_distribution[i] for i in range(len(categories))}
        except Exception as e:
            # Best-effort fallback: weigh every category equally
            print(f"Warning: Using fallback stationary distribution due to error: {e}")
            stationary_distribution_dict = {category: 1 / len(categories) for category in categories}

        # Calculate Chao-Shen transition entropy
        chao_shen_overall_transition_entropy, chao_shen_transition_entropy_per_category = calculate_chao_shen_transition_entropy(
            raw_transition_matrix, stationary_distribution_dict
        )

        # Normalize entropies by log2(number of categories); 0 when only one category
        num_categories = len(categories)
        normalized_chao_shen_overall_entropy = (
            chao_shen_overall_transition_entropy / np.log2(num_categories) if num_categories > 1 else 0
        )
        normalized_chao_shen_transition_entropy_per_category = {
            category: entropy / np.log2(num_categories) if num_categories > 1 else 0
            for category, entropy in chao_shen_transition_entropy_per_category.items()
        }

        # Store results for this trial
        chao_shen_result = {
            'Trial_ID': trial_id,
            'Collider_Name': collider_name,
            'Occurrence_Order': occurrence_order,
            'Gaze_Sequence_Length': len(gaze_sequence),
            'Chao_Shen_Overall_Transition_Entropy': normalized_chao_shen_overall_entropy,
        }

        # Add per-category entropies to the result; categories whose row was
        # skipped by the estimator are reported as 0.
        for category in categories:
            chao_shen_result[f'Chao_Shen_Transition_Entropy_{category}'] = normalized_chao_shen_transition_entropy_per_category.get(
                category, 0
            )
        chao_shen_entropy_results.append(chao_shen_result)

    # Save all results for this participant to a single file
    chao_shen_entropy_df = pd.DataFrame(chao_shen_entropy_results)
    chao_shen_entropy_df.to_csv(os.path.join(output_entropy_dir, f"Chao_Shen_{participant_id}_entropy_results.csv"), index=False)


Processing Files:   1%|▎                      | 2/145 [00:05<06:13,  2.61s/file]



Processing Files:  10%|██▎                   | 15/145 [00:33<04:50,  2.23s/file]



Processing Files:  12%|██▋                   | 18/145 [00:40<04:49,  2.28s/file]



Processing Files:  21%|████▋                 | 31/145 [01:05<03:20,  1.76s/file]



Processing Files:  25%|█████▍                | 36/145 [01:15<03:12,  1.76s/file]



Processing Files:  29%|██████▎               | 42/145 [01:26<03:22,  1.97s/file]



Processing Files:  34%|███████▌              | 50/145 [01:42<02:53,  1.82s/file]



Processing Files:  37%|████████▏             | 54/145 [01:51<03:13,  2.12s/file]



Processing Files:  38%|████████▎             | 55/145 [01:53<03:16,  2.18s/file]



Processing Files:  41%|█████████             | 60/145 [02:04<02:50,  2.01s/file]



Processing Files:  43%|█████████▌            | 63/145 [02:10<02:36,  1.91s/file]



Processing Files:  47%|██████████▎           | 68/145 [02:20<02:37,  2.05s/file]



Processing Files:  58%|████████████▋         | 84/145 [02:56<02:30,  2.47s/file]



Processing Files:  59%|█████████████         | 86/145 [03:00<02:13,  2.26s/file]



Processing Files:  68%|██████████████▊       | 98/145 [03:27<01:45,  2.24s/file]



Processing Files:  76%|███████████████▉     | 110/145 [03:53<01:20,  2.29s/file]



Processing Files:  81%|█████████████████    | 118/145 [04:12<01:03,  2.35s/file]



Processing Files:  82%|█████████████████▏   | 119/145 [04:15<01:01,  2.35s/file]



Processing Files:  83%|█████████████████▍   | 120/145 [04:17<00:58,  2.36s/file]



Processing Files:  93%|███████████████████▌ | 135/145 [04:45<00:15,  1.53s/file]



Processing Files:  95%|███████████████████▉ | 138/145 [04:51<00:13,  1.96s/file]



Processing Files:  98%|████████████████████▌| 142/145 [04:59<00:05,  1.89s/file]



Processing Files:  99%|████████████████████▊| 144/145 [05:03<00:01,  1.78s/file]



Processing Files: 100%|█████████████████████| 145/145 [05:04<00:00,  2.10s/file]


In [6]:
# Debug check: display the participant ID left over from the last file handled by the processing loop above.
participant_id

'9586_5'


# Weighted Shannon entropy of a row-normalised transition matrix
def calculate_transition_entropy(matrix, stationary_distribution):
    """Return ``(weighted_total, per_row)`` entropies in bits.

    Each row's entropy only counts entries greater than zero. ``per_row``
    maps the row label to that entropy; ``weighted_total`` adds up the row
    entropies multiplied by ``stationary_distribution.get(label, 0)``.
    """
    per_row = {}
    weighted_total = 0

    for label, probabilities in matrix.iterrows():
        entropy_bits = 0
        for prob in probabilities:
            if prob > 0:
                entropy_bits -= prob * log2(prob)

        per_row[label] = entropy_bits
        weight = stationary_distribution.get(label, 0)
        weighted_total += entropy_bits * weight

    return weighted_total, per_row

# Process all CSV files: cut each recording into collider-triggered trials and
# compute normalized transition and stationary entropies for every trial.
# NOTE(review): another cell in this notebook also writes
# "{participant_id}_entropy_results.csv" into output_entropy_dir, so whichever
# runs last overwrites the other's results - confirm that is intended.
for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # Participant ID is the file name up to its first dot. Slicing a fixed
    # number of characters off the path breaks for IDs that are not exactly
    # six characters long (and can even pick up the path separator).
    participant_id = os.path.basename(file_path).split('.')[0]

    # Load the data; `timeStampDataPointEnd` is parsed as seconds since the epoch
    data = pd.read_csv(file_path)
    data['date_seconds'] = pd.to_datetime(data['timeStampDataPointEnd'], unit='s')

    # Filter for the desired gaze events (rows whose `events` value is -2)
    data_Reduced = data[data['events'] == -2]

    # Filter and label rows with colliders
    filtered_df = data_Reduced[data_Reduced['names'].isin(collider_list)].copy()
    filtered_df['Occurrence_Order'] = filtered_df.groupby('names').cumcount() + 1

    # The recording bounds do not change inside the trial loop: compute them once
    recording_start = data_Reduced['date_seconds'].min()
    recording_end = data_Reduced['date_seconds'].max()

    # Segment data by each occurrence of colliders
    last_processed_time = {}
    trials = []
    for index, row in filtered_df.iterrows():
        collider_name = row['names']
        occurrence_time = row['date_seconds']

        # Skip a hit if the same collider already opened a trial <= 30 s earlier
        if (
            collider_name in last_processed_time
            and (occurrence_time - last_processed_time[collider_name]).total_seconds() <= 30
        ):
            continue

        # Update the last processed time for this collider
        last_processed_time[collider_name] = occurrence_time

        # 30-second window (5 s before the hit to 25 s after it), clipped to
        # the recording bounds
        window_start = max(recording_start, occurrence_time - pd.Timedelta(seconds=5))
        window_end = min(recording_end, occurrence_time + pd.Timedelta(seconds=25))

        # Extract the constrained window
        trial_segment = data_Reduced[
            (data_Reduced['date_seconds'] >= window_start) &
            (data_Reduced['date_seconds'] <= window_end)
        ].copy()

        if trial_segment.empty:
            continue

        # Add trial-specific labels; the running trial count makes Trial_ID unique
        trial_number = len(trials) + 1
        trial_segment['Collider_Name'] = collider_name
        trial_segment['Occurrence_Order'] = trial_number
        trial_segment['Trial_ID'] = f"{collider_name}_Trial_{trial_number}"
        trials.append(trial_segment)

    # Combine all trials into a single DataFrame
    if trials:
        trials_df = pd.concat(trials, ignore_index=True)
    else:
        continue  # Skip this file if no trials are found

    # Calculate entropy for each trial
    entropy_results = []
    for trial_id, trial_data in trials_df.groupby('Trial_ID'):
        collider_name = trial_data['Collider_Name'].iloc[0]
        occurrence_order = trial_data['Occurrence_Order'].iloc[0]

        # Apply mapping and extract gaze sequence (on a copy of the group slice)
        trial_data = trial_data.copy()
        trial_data["Mapped_Column_Collider_Categorical"] = trial_data["Collider_CategoricalN"].replace(mapping)
        gaze_sequence = trial_data['Mapped_Column_Collider_Categorical'].reset_index(drop=True)
        categories = gaze_sequence.unique()
        transition_matrix = pd.DataFrame(0, index=categories, columns=categories, dtype=float)

        # Build transition matrix: count each consecutive (current, next) pair
        for i in range(len(gaze_sequence) - 1):
            current_category = gaze_sequence.iloc[i]
            next_category = gaze_sequence.iloc[i + 1]
            transition_matrix.loc[current_category, next_category] += 1

        # Normalize transition matrix (rows without outgoing transitions become 0)
        transition_matrix = transition_matrix.div(transition_matrix.sum(axis=1), axis=0).fillna(0)

        # Stationary distribution = left eigenvector of the transition matrix
        # for eigenvalue 1, rescaled to sum to 1.
        try:
            eigvals, eigvecs = np.linalg.eig(transition_matrix.T)
            unit_positions = np.flatnonzero(np.isclose(eigvals, 1))
            # Flattening several matching eigenvectors would interleave their
            # entries and assign wrong weights, so require exactly one match.
            if len(unit_positions) != 1:
                raise ValueError(
                    f"expected exactly one eigenvalue equal to 1, found {len(unit_positions)}"
                )
            stationary_distribution = np.real(eigvecs[:, unit_positions[0]])
            eigvec_total = stationary_distribution.sum()
            if not np.isfinite(eigvec_total) or np.isclose(eigvec_total, 0):
                raise ValueError("eigenvector for eigenvalue 1 cannot be normalized")
            stationary_distribution = stationary_distribution / eigvec_total
            stationary_distribution_dict = {categories[i]: stationary_distribution[i] for i in range(len(categories))}
        except Exception:
            # Best-effort fallback: weigh every category equally. `Exception`
            # (not a bare except) keeps Ctrl-C / kernel interrupt working.
            stationary_distribution_dict = {category: 1 / len(categories) for category in categories}

        # Calculate entropies
        overall_transition_entropy, transition_entropy_per_category = calculate_transition_entropy(
            transition_matrix, stationary_distribution_dict
        )

        # Stationary entropy per category: -pi * log2(pi), or 0 when pi <= 0
        stationary_entropy_per_category = {
            category: (-stationary_distribution_dict[category] * log2(stationary_distribution_dict[category]))
            if stationary_distribution_dict[category] > 0 else 0
            for category in categories
        }

        # Normalize entropies by log2(number of categories); 0 when only one category
        num_categories = len(categories)
        normalized_overall_entropy = overall_transition_entropy / log2(num_categories) if num_categories > 1 else 0
        normalized_transition_entropy_per_category = {
            category: entropy / log2(num_categories) if num_categories > 1 else 0
            for category, entropy in transition_entropy_per_category.items()
        }
        normalized_stationary_entropy_per_category = {
            category: entropy / log2(num_categories) if num_categories > 1 else 0
            for category, entropy in stationary_entropy_per_category.items()
        }

        # Store results for this trial
        result = {
            'Trial_ID': trial_id,
            'Collider_Name': collider_name,
            'Occurrence_Order': occurrence_order,
            'Gaze_Sequence_Length': len(gaze_sequence),
            'Overall_Transition_Entropy': normalized_overall_entropy
        }

        # Add per-category entropies to the result
        for category in categories:
            result[f'Transition_Entropy_{category}'] = normalized_transition_entropy_per_category.get(category, 0)
            result[f'Stationary_Entropy_{category}'] = normalized_stationary_entropy_per_category.get(category, 0)

        entropy_results.append(result)

    # Save entropy results for the file
    entropy_df = pd.DataFrame(entropy_results)
    entropy_df.to_csv(os.path.join(output_entropy_dir, f"{participant_id}_entropy_results.csv"), index=False)


import pandas as pd
import numpy as np
from math import log2
import os
import glob
from tqdm import tqdm  # Import the progress bar library
# Define paths: one input folder of per-participant CSV files and three
# output folders (trial segments, transition matrices, entropy tables).
input_dir = "/Volumes/TwoTeras/0_Experiment_1/Eye_Tracking/Pre_processed/05_Debbies_gaze/"
output_trials_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/trials_df/"
output_transition_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/transition_matrix/"
output_entropy_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/entropy_results/"

# Ensure output directories exist (no error if they are already there)
os.makedirs(output_trials_dir, exist_ok=True)
os.makedirs(output_transition_dir, exist_ok=True)
os.makedirs(output_entropy_dir, exist_ok=True)

# Collider list: names used to select rows via `names.isin(collider_list)`
collider_list = [
    '56_Sa', '39_Sa', '19_Cma', '55_Sa', '25_Cma', '40_Sa', '41_Sa',
    '17_Cma', '47_Sa', '03_Cma', '13_Cma', '24_Cma', '01_Cma', '54_Sa',
    '15_Cma', '29_Sa', '04_Cma', '49_Sa', '30_Sa', '02_Cma', '51_Sa',
    '08_Cma', '28_Cma', '26_Cma', '44_Sa', '06_Cma', '53_Sa', '37_Sa',
    '32_Sa', '20_Cma', '16_Cma', '50_Sa', '34_Sa', '11_Cma', '38_Sa',
    '33_Sa', '12_Cma', '22_Cma', '42_Sa', '05_Cma', '23_Cma', '18_Cma',
    '27_Cma', '45_Sa', '43_Sa', '09_Cma', '31_Sa', '48_Sa', '10_Cma',
    '52_Sa', '07_Cma', '46_Sa', '35_Sa', '36_Sa', '21_Cma', '14_Cma'
]

# Get list of files to process. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the processing order
# reproducible.
file_paths = sorted(glob.glob(os.path.join(input_dir, "*.csv")))

# Process all CSV files with a progress bar: cut each recording into
# collider-triggered trials, save trials and per-trial transition matrices,
# and compute the normalized overall transition entropy of every trial.
# NOTE(review): another cell in this notebook also writes
# "{participant_id}_entropy_results.csv" into output_entropy_dir, so whichever
# runs last overwrites the other's results - confirm that is intended.
for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # Participant ID is the file name up to its first dot. Slicing a fixed
    # number of characters off the path breaks for IDs that are not exactly
    # six characters long (and can even pick up the path separator).
    participant_id = os.path.basename(file_path).split('.')[0]

    # Load the data; `timeStampDataPointEnd` is parsed as seconds since the epoch
    data = pd.read_csv(file_path)
    data['date_seconds'] = pd.to_datetime(data['timeStampDataPointEnd'], unit='s')

    # Filter for the desired gaze events (rows whose `events` value is -2)
    data_Reduced = data[data['events'] == -2]

    # Filter and label rows with colliders
    filtered_df = data_Reduced[data_Reduced['names'].isin(collider_list)].copy()
    filtered_df['Occurrence_Order'] = filtered_df.groupby('names').cumcount() + 1

    # The recording bounds do not change inside the trial loop: compute them once
    recording_start = data_Reduced['date_seconds'].min()
    recording_end = data_Reduced['date_seconds'].max()

    # Maintain a dictionary to track the last processed time for each collider
    last_processed_time = {}

    # Segment data by each occurrence of colliders
    trials = []
    for index, row in filtered_df.iterrows():
        collider_name = row['names']
        occurrence_time = row['date_seconds']

        # Skip a hit if the same collider already opened a trial <= 30 s earlier
        if (
            collider_name in last_processed_time
            and (occurrence_time - last_processed_time[collider_name]).total_seconds() <= 30
        ):
            continue

        # Update the last processed time for this collider
        last_processed_time[collider_name] = occurrence_time

        # 30-second window starting at the hit, clipped to the recording bounds
        window_start = max(recording_start, occurrence_time)
        window_end = min(recording_end, occurrence_time + pd.Timedelta(seconds=30))

        # Extract the constrained window
        trial_segment = data_Reduced[
            (data_Reduced['date_seconds'] >= window_start) &
            (data_Reduced['date_seconds'] <= window_end)
        ].copy()

        if trial_segment.empty:
            continue

        # Add trial-specific labels; the running trial count makes Trial_ID unique
        trial_number = len(trials) + 1
        trial_segment['Collider_Name'] = collider_name
        trial_segment['Occurrence_Order'] = trial_number
        trial_segment['Trial_ID'] = f"{collider_name}_Trial_{trial_number}"
        trials.append(trial_segment)

    # Combine all trials into a single DataFrame
    if trials:
        trials_df = pd.concat(trials, ignore_index=True)
    else:
        continue  # Skip this file if no trials are found

    # Save trials_df
    trials_df.to_csv(os.path.join(output_trials_dir, f"{participant_id}_trials_df.csv"), index=False)

    # Calculate transition matrices and entropy
    entropy_results = []
    for trial_id, trial_data in trials_df.groupby('Trial_ID'):
        collider_name = trial_data['Collider_Name'].iloc[0]
        occurrence_order = trial_data['Occurrence_Order'].iloc[0]

        gaze_sequence = trial_data['Collider_CategoricalN'].reset_index(drop=True)
        categories = gaze_sequence.unique()
        transition_matrix = pd.DataFrame(0, index=categories, columns=categories, dtype=float)

        # Build the transition matrix: count each consecutive (current, next) pair
        for i in range(len(gaze_sequence) - 1):
            current_category = gaze_sequence.iloc[i]
            next_category = gaze_sequence.iloc[i + 1]
            transition_matrix.loc[current_category, next_category] += 1

        # Normalize the transition matrix (rows without outgoing transitions become 0)
        transition_matrix = transition_matrix.div(transition_matrix.sum(axis=1), axis=0).fillna(0)

        # Save transition matrix. The trial ID is part of the file name;
        # without it every trial of a participant wrote to the same file and
        # only the last trial's matrix survived.
        transition_matrix.to_csv(
            os.path.join(output_transition_dir, f"{participant_id}_{trial_id}_transition_matrix.csv")
        )

        # Stationary distribution = left eigenvector of the transition matrix
        # for eigenvalue 1, rescaled to sum to 1.
        try:
            eigvals, eigvecs = np.linalg.eig(transition_matrix.T)
            unit_positions = np.flatnonzero(np.isclose(eigvals, 1))
            # Flattening several matching eigenvectors would interleave their
            # entries and assign wrong weights, so require exactly one match.
            if len(unit_positions) != 1:
                raise ValueError(
                    f"expected exactly one eigenvalue equal to 1, found {len(unit_positions)}"
                )
            stationary_distribution = np.real(eigvecs[:, unit_positions[0]])
            eigvec_total = stationary_distribution.sum()
            if not np.isfinite(eigvec_total) or np.isclose(eigvec_total, 0):
                raise ValueError("eigenvector for eigenvalue 1 cannot be normalized")
            stationary_distribution = stationary_distribution / eigvec_total

            stationary_distribution_dict = {categories[i]: stationary_distribution[i] for i in range(len(categories))}
        except Exception:
            # Best-effort fallback: weigh every category equally. `Exception`
            # (not a bare except) keeps Ctrl-C / kernel interrupt working.
            stationary_distribution_dict = {category: 1 / len(categories) for category in categories}

        # Calculate entropy: row entropies (bits) weighted by the stationary
        # distribution. Computed inline rather than through a function defined
        # inside the loop, which was re-created on every iteration and replaced
        # the notebook-level `calculate_transition_entropy` (which returns a
        # tuple) with a scalar-returning one.
        overall_transition_entropy = 0
        for state, row_probabilities in transition_matrix.iterrows():
            row_entropy = sum(-p * log2(p) for p in row_probabilities if p > 0)
            overall_transition_entropy += row_entropy * stationary_distribution_dict.get(state, 0)

        # Normalize by log2(number of categories); 0 when only one category
        num_categories = len(transition_matrix)
        normalized_overall_entropy = overall_transition_entropy / log2(num_categories) if num_categories > 1 else 0

        result = {
            'Trial_ID': trial_id,
            'Collider_Name': collider_name,
            'Occurrence_Order': occurrence_order,
            'Overall_Transition_Entropy': normalized_overall_entropy
        }
        entropy_results.append(result)

    # Save entropy results
    entropy_df = pd.DataFrame(entropy_results)
    entropy_df.to_csv(os.path.join(output_entropy_dir, f"{participant_id}_entropy_results.csv"), index=False)