In [1]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import timedelta


In [2]:
# Collapse the fine-grained collider categories into two coarse labels.
# Categories that are not listed here are left untouched by `.replace(mapping)`.
mapping = {
    **dict.fromkeys(
        ("TaskBuilding_Public", "TaskBuilding_Residential"),
        "Task_Building",
    ),
    **dict.fromkeys(
        ("Active_Agent", "Passive_Agent", "Active_Agent_Face", "Passive_Agent_Face"),
        "Agent",
    ),
}

# Names of the colliders whose gaze events open an analysis window:
# 28 "_Cma" colliders (01-28) and 28 "_Sa" colliders (29-56), in no particular order.
collider_list = [
    "56_Sa", "39_Sa", "19_Cma", "55_Sa", "25_Cma", "40_Sa", "41_Sa", "17_Cma",
    "47_Sa", "03_Cma", "13_Cma", "24_Cma", "01_Cma", "54_Sa", "15_Cma", "29_Sa",
    "04_Cma", "49_Sa", "30_Sa", "02_Cma", "51_Sa", "08_Cma", "28_Cma", "26_Cma",
    "44_Sa", "06_Cma", "53_Sa", "37_Sa", "32_Sa", "20_Cma", "16_Cma", "50_Sa",
    "34_Sa", "11_Cma", "38_Sa", "33_Sa", "12_Cma", "22_Cma", "42_Sa", "05_Cma",
    "23_Cma", "18_Cma", "27_Cma", "45_Sa", "43_Sa", "09_Cma", "31_Sa", "48_Sa",
    "10_Cma", "52_Sa", "07_Cma", "46_Sa", "35_Sa", "36_Sa", "21_Cma", "14_Cma",
]


In [3]:
# Function to calculate Chao-Shen entropy
def chao_shen(q):
    """Estimate Shannon entropy (in bits) with the Chao-Shen coverage correction.

    Parameters
    ----------
    q : array-like
        Observed counts per bin. Any array-like is accepted (list, tuple,
        ndarray); entries that are not strictly positive are ignored.

    Returns
    -------
    float
        The entropy estimate. ``0.0`` is returned when the total count is at
        most 1 (including empty or all-zero input), because no entropy can be
        estimated from fewer than two observations.
    """
    counts = np.asarray(q)
    counts = counts[counts > 0]  # Remove zero-count bins
    n = np.sum(counts)

    # Bail out before any arithmetic: with n == 0 the coverage term below
    # would divide by zero and emit a RuntimeWarning for a value that is
    # discarded anyway.
    if n <= 1:
        return 0.0

    p = counts.astype(float) / n
    f1 = np.sum(counts == 1)  # Number of bins observed exactly once
    if f1 == n:
        # Every observation is a singleton: keep the coverage strictly
        # positive so the adjusted probabilities do not collapse to zero.
        f1 -= 1
    C = 1 - (f1 / n)  # Sample coverage
    pa = C * p  # Coverage-adjusted probabilities
    la = 1 - (1 - pa) ** n  # Probability that each bin is observed at least once
    H = -np.sum((pa * np.log2(pa)) / la)

    # Adding 0.0 turns the -0.0 produced by a single non-empty bin into 0.0
    # and leaves every other value unchanged.
    return H + 0.0

In [None]:
# Compute transition matrices using mapped categories
def compute_transition_entropy(window_data):
    """Normalised Chao-Shen entropy of the outgoing-transition counts in a window.

    A category-by-category transition count matrix is built from consecutive
    rows of ``window_data['Mapped_Category']``. The row totals of that matrix
    (how many transitions leave each category) are passed to ``chao_shen`` and
    the estimate is divided by ``log2(number of categories)``.

    Parameters
    ----------
    window_data : pandas.DataFrame
        Rows of one time window, in gaze order, with a ``Mapped_Category``
        column.

    Returns
    -------
    float
        The normalised entropy, or ``np.nan`` when the window is empty,
        contains fewer than two distinct categories, or has no transitions.

    Notes
    -----
    NOTE(review): only the row totals of the transition matrix enter the
    entropy, so the result reflects how often each category is a transition
    source rather than the full transition structure. The Chao-Shen estimate
    can also exceed ``log2(k)``, so the normalised value is not bounded by 1.
    Confirm that both are intended.
    """
    if window_data.empty:
        return np.nan

    gaze_sequence = window_data['Mapped_Category']
    n_categories = len(gaze_sequence.unique())
    if n_categories < 2:
        return np.nan

    # Integer-encode the labels in order of first appearance. Missing labels
    # come back as -1; ``unique()`` counted them as a category, so they are
    # given the last slot instead of silently wrapping around as index -1.
    codes, _ = pd.factorize(gaze_sequence)
    codes = np.where(codes < 0, n_categories - 1, codes)

    # Count every (current, next) pair in one vectorised pass instead of one
    # pandas ``.loc`` read-modify-write per sample.
    transition_counts = np.zeros((n_categories, n_categories), dtype=float)
    np.add.at(transition_counts, (codes[:-1], codes[1:]), 1)

    transition_sums = transition_counts.sum(axis=1)
    if transition_sums.sum() == 0:
        return np.nan  # Avoid division errors

    H = chao_shen(transition_sums)
    # n_categories >= 2 is guaranteed by the guard above, so log2 is non-zero.
    return H / np.log2(n_categories)

In [4]:
# 
# Define paths
input_dir = "/Volumes/TwoTeras/0_Experiment_1/Eye_Tracking/Pre_processed/05_Debbies_gaze/"
output_entropy_dir = "/Volumes/TwoTeras/0_Experiment_1/Entropy_Results/Window/entropy_results/CausalImpact/"
os.makedirs(output_entropy_dir, exist_ok=True)

file_paths = glob.glob(os.path.join(input_dir, "*.csv"))

# Length in seconds of the pre- and post-gaze windows. The same value is the
# minimum spacing between two analysed gazes on the same collider.
window_seconds = 30
window_length = timedelta(seconds=window_seconds)

# Column layout of every per-participant result file. Passing it explicitly
# keeps the header present even when a participant yields no trials.
result_columns = [
    'Participant_ID',
    'Trial_ID',
    'Collider_Name',
    'Occurrence_Order',
    'Gaze_Time',
    'Pre_Entropy',
    'Post_Entropy',
]

# Process all CSV files
for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # The participant ID is the file name up to its first dot.
    participant_id = os.path.basename(file_path).split('.')[0]
    data = pd.read_csv(file_path)
    # Timestamps are interpreted as seconds since the Unix epoch.
    data['date_seconds'] = pd.to_datetime(data['timeStampDataPointEnd'], unit='s')

    # Map categories
    data['Mapped_Category'] = data['Collider_CategoricalN'].replace(mapping)

    # Keep only rows with events == -2 (presumably gaze events -- confirm
    # against the preprocessing step); the entropy windows are cut from these.
    data_reduced = data[data['events'] == -2]

    # Gaze events on the colliders of interest, numbered per collider in order
    # of occurrence.
    filtered_df = data_reduced[data_reduced['names'].isin(collider_list)].copy()
    filtered_df['Occurrence_Order'] = filtered_df.groupby('names').cumcount() + 1

    # The recording limits are the same for every trial of this participant,
    # so compute them once instead of inside the trial loop.
    timestamps = data_reduced['date_seconds']
    data_start = timestamps.min()
    data_end = timestamps.max()

    last_processed_time = {}  # Track last processed time per collider
    results = []
    for trial_index, (index, row) in enumerate(filtered_df.iterrows(), start=1):
        gaze_time = row['date_seconds']
        collider_name = row['names']
        occurrence_order = row['Occurrence_Order']
        # trial_index counts every candidate gaze, including ones skipped
        # below, so trial numbers in the output can have gaps.
        trial_id = f"{collider_name}_Trial_{trial_index}"

        # Ensure no overlapping windows: skip a gaze that falls within
        # window_seconds of the last analysed gaze on the same collider.
        previous_time = last_processed_time.get(collider_name)
        if previous_time is not None and (gaze_time - previous_time).total_seconds() <= window_seconds:
            continue  # Skip overlapping trials
        last_processed_time[collider_name] = gaze_time

        # Ensure window boundaries do not exceed data limits
        pre_window_start = max(data_start, gaze_time - window_length)
        pre_window_end = min(gaze_time, data_end)
        post_window_start = max(gaze_time, data_start)
        post_window_end = min(data_end, gaze_time + window_length)

        # Extract gaze sequences for both windows. Both bounds are inclusive,
        # so the triggering sample belongs to the pre and the post window.
        pre_window_data = data_reduced[(timestamps >= pre_window_start) & (timestamps <= pre_window_end)]
        post_window_data = data_reduced[(timestamps >= post_window_start) & (timestamps <= post_window_end)]
        pre_entropy = compute_transition_entropy(pre_window_data)
        post_entropy = compute_transition_entropy(post_window_data)

        # Store results
        results.append({
            'Participant_ID': participant_id,
            'Trial_ID': trial_id,
            'Collider_Name': collider_name,
            'Occurrence_Order': occurrence_order,
            'Gaze_Time': gaze_time,
            'Pre_Entropy': pre_entropy,
            'Post_Entropy': post_entropy
        })

    # Save results for each participant separately
    entropy_df = pd.DataFrame(results, columns=result_columns)
    entropy_df.to_csv(os.path.join(output_entropy_dir, f"CausalImpact_entropy_{participant_id}.csv"), index=False)


Processing Files: 100%|█████████████████████| 145/145 [04:19<00:00,  1.79s/file]


In [5]:
# Walk the file list again without loading any data; the only effect is that
# `participant_id` ends up holding the ID derived from the last path.
for file_path in tqdm(file_paths, desc="Processing Files", unit="file"):
    # Text before the first dot of the file name.
    participant_id = os.path.basename(file_path).partition('.')[0]

Processing Files: 100%|█████████████████| 145/145 [00:00<00:00, 534423.62file/s]


In [6]:
# Display the participant ID left in the kernel by the last loop iteration above.
participant_id

'9586_5'