# PCA on High-Level Dimensions

In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from IPython.display import display
import analysis_utils as au
import altair as alt
import plot

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', None)

In [3]:
try:
    df_raw = pd.read_csv('./data/super_experiment_design_space.csv')
    print(f"Successfully loaded raw dataset. Shape: {df_raw.shape}")
except FileNotFoundError:
    print("Error: './data/super_experiment_design_space.csv' not found.")

df_pca_features, numerical_cols, categorical_cols, df_processed, preprocessor = au.preprocess(df_raw, merge_conflict_dimensions=True)
print("\nPreprocessing complete.")
display(df_pca_features.head())
assert df_pca_features[df_pca_features.isnull().any(axis=1)].size == 0

Successfully loaded raw dataset. Shape: (349, 24)

Preprocessing complete.


Unnamed: 0,Task 2 Response Probability,Inter-task SOA,Distractor SOA,Task 1 CSI,Task 2 CSI,RSI,Switch Rate,Task 1 Difficulty,Task 2 Difficulty,Inter-task SOA is NA,Distractor SOA is NA,Task 2 CSI is NA,Task 2 Difficulty is NA,Response Set Overlap Mapped,RSI is Predictable,Task 1 Stimulus-Response Mapping Mapped,Task 1 Cue Type Mapped,Task 2 Stimulus-Response Mapping Mapped,Task 2 Cue Type Mapped,Trial Transition Type Mapped,SBC_Mapped
0,1,500.0,0.0,0,0.0,1000.0,0.0,0.0,0.0,0,1,0,0,RSO_Identical,1,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,
1,1,1000.0,0.0,0,0.0,1000.0,0.0,0.0,0.0,0,1,0,0,RSO_Identical,1,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,
2,1,2000.0,0.0,0,0.0,1000.0,0.0,0.0,0.0,0,1,0,0,RSO_Identical,1,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,
3,1,4000.0,0.0,0,0.0,1000.0,0.0,0.0,0.0,0,1,0,0,RSO_Identical,1,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,
4,1,1000.0,0.0,0,0.0,1000.0,0.0,0.5,0.5,0,1,0,0,RSO_Identical,1,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,


### Exploratory Data Analysis

In [4]:
# List of the original categorical columns we want to inspect
categorical_columns_to_explore = [
    'Response Set Overlap',
    #'Stimulus-Stimulus Congruency',
    #'Stimulus-Response Congruency',
    'Stimulus Bivalence & Congruency',
    'Task 1 Stimulus-Response Mapping',
    'Task 1 Cue Type',
    'Task 2 Stimulus-Response Mapping',
    'Task 2 Cue Type'
]
numeric_columns_to_explore = [
    'Inter-task SOA',
    'Distractor SOA',
    'Task 1 CSI',
    'Task 2 CSI', 
    'Task 2 Difficulty'
]

# Loop through the columns and print their value counts
for col in categorical_columns_to_explore:
    print(f"\nValue counts for column: '{col}'")
    # Use dropna=False to explicitly see the count of any missing values (NaNs)
    print(df_processed[col].value_counts(dropna=False))
    print("-" * 30)

print("-" * 60)
print("Review the counts above. If a sub-category has very few examples (e.g., < 5-10),")
print("it is generally better to collapse it into a broader category for the PCA.")
print("This analysis should guide the modifications to the mapping functions in Stage 3.")

for col in numeric_columns_to_explore:
    print(col, au.check_skewness(df_processed[col]))

print("-" * 60)
print("Review the skewness of the numeric columns. If they are skewed it'll be better to")
print("impute with the median instead of the mean.")


Value counts for column: 'Response Set Overlap'
Response Set Overlap
Identical                              142
NaN                                    115
Disjoint - Modality                     32
Disjoint - Category (Same Modality)     23
Disjoint - Effector                     22
Disjoint - Modality (Standard)           8
Disjoint - Modality (Non-Standard)       7
Name: count, dtype: int64
------------------------------

Value counts for column: 'Stimulus Bivalence & Congruency'
Stimulus Bivalence & Congruency
Incongruent    108
Neutral         99
N/A             73
Congruent       69
Name: count, dtype: int64
------------------------------

Value counts for column: 'Task 1 Stimulus-Response Mapping'
Task 1 Stimulus-Response Mapping
Compatible      174
Arbitrary       171
Incompatible      4
Name: count, dtype: int64
------------------------------

Value counts for column: 'Task 1 Cue Type'
Task 1 Cue Type
None/Implicit    253
Arbitrary         96
Name: count, dtype: int64
--------

## PCA Fitting

In [5]:
pipeline = au.create_pca_pipeline(numerical_cols, categorical_cols)
pipeline.fit(df_pca_features)
pca_results = pipeline.transform(df_pca_features)
column_names = [f'PC{i+1}' for i in range(pca_results.shape[1])]

# 5. Create a Pandas DataFrame with the specified column names
pca_df = pd.DataFrame(pca_results, columns=column_names)
print("PCA pipeline fitted successfully.")

PCA pipeline fitted successfully.


In [35]:
pca = pipeline.named_steps['pca']
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

print("\n--- Explained Variance ---")
for i, (var, cum_var) in enumerate(zip(explained_variance, cumulative_variance)):
    if cum_var > 0.95: break
    print(f"PC{i+1}: Explained Variance = {var:.3f}, Cumulative Variance = {cum_var:.3f}")
    
loadings = au.get_component_loadings(pipeline, numerical_cols, categorical_cols)
print("\n--- Principal Component Loadings ---")
display(loadings.round(3))
print("Sparseness of top 3 loadings:", np.mean(au.get_loadings_sparseness(loadings.to_numpy()[:,:3])))


--- Explained Variance ---
PC1: Explained Variance = 0.203, Cumulative Variance = 0.203
PC2: Explained Variance = 0.170, Cumulative Variance = 0.374
PC3: Explained Variance = 0.124, Cumulative Variance = 0.497
PC4: Explained Variance = 0.089, Cumulative Variance = 0.587
PC5: Explained Variance = 0.078, Cumulative Variance = 0.665
PC6: Explained Variance = 0.061, Cumulative Variance = 0.726
PC7: Explained Variance = 0.049, Cumulative Variance = 0.775
PC8: Explained Variance = 0.039, Cumulative Variance = 0.814
PC9: Explained Variance = 0.033, Cumulative Variance = 0.847
PC10: Explained Variance = 0.028, Cumulative Variance = 0.875
PC11: Explained Variance = 0.021, Cumulative Variance = 0.897
PC12: Explained Variance = 0.017, Cumulative Variance = 0.914
PC13: Explained Variance = 0.016, Cumulative Variance = 0.930
PC14: Explained Variance = 0.014, Cumulative Variance = 0.945

--- Principal Component Loadings ---


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,PC41,PC42
Task 2 Response Probability,-0.157,0.439,0.335,-0.14,0.256,-0.294,0.139,0.126,-0.034,0.02,-0.071,-0.074,0.015,-0.005,0.147,-0.08,0.081,0.061,0.357,0.186,0.308,0.084,-0.019,-0.008,0.132,0.144,0.06,-0.327,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
Inter-task SOA,-0.062,0.196,0.154,-0.351,0.303,0.724,-0.172,-0.281,-0.216,0.122,-0.031,-0.064,-0.019,0.03,0.013,0.035,0.039,0.082,-0.068,-0.057,0.03,-0.009,-0.003,0.004,0.001,0.009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0
Distractor SOA,0.008,-0.049,-0.101,0.526,0.58,0.258,0.106,0.434,0.158,0.128,0.21,-0.065,-0.034,0.07,0.028,0.06,0.036,-0.008,-0.047,0.009,0.029,0.061,-0.003,-0.004,0.011,-0.003,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0
Task 1 CSI,0.218,-0.349,0.457,-0.163,0.054,-0.067,-0.082,0.058,0.072,-0.104,0.287,-0.263,-0.058,-0.036,0.11,0.366,-0.451,0.22,0.048,0.031,-0.012,0.044,-0.028,-0.01,0.008,0.003,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0
Task 2 CSI,0.316,-0.258,0.434,-0.089,0.094,-0.086,-0.315,0.115,0.057,-0.022,0.098,0.078,0.015,-0.006,-0.041,-0.281,0.574,-0.229,-0.11,-0.083,0.042,-0.065,0.02,0.018,-0.013,0.002,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0
RSI,-0.082,-0.301,0.297,-0.075,-0.171,0.24,0.792,0.142,-0.155,0.041,-0.123,0.053,0.014,-0.037,-0.053,-0.036,0.114,-0.07,-0.037,0.032,-0.0,-0.07,-0.013,0.002,-0.012,-0.006,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0
Switch Rate,0.458,0.051,-0.052,0.185,-0.322,0.211,0.041,-0.15,0.082,0.443,0.105,-0.2,-0.059,-0.011,0.22,-0.167,0.038,-0.035,0.451,0.166,-0.013,0.09,-0.002,-0.04,-0.002,-0.031,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0
Task 1 Difficulty,-0.265,0.143,0.324,0.203,-0.389,0.141,-0.363,0.449,-0.153,0.307,-0.196,0.071,0.008,-0.12,-0.054,0.129,-0.161,-0.138,-0.102,0.067,-0.037,0.009,0.035,-0.001,0.02,0.011,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0
Task 2 Difficulty,-0.2,0.199,0.326,0.507,-0.171,0.106,0.025,-0.345,-0.066,-0.387,0.354,-0.132,-0.03,-0.07,-0.173,-0.228,-0.025,0.02,0.014,-0.085,0.017,-0.037,-0.011,0.031,-0.039,0.008,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0
Inter-task SOA is NA_0,-0.054,0.142,0.118,-0.042,0.063,-0.052,0.044,0.028,0.049,-0.01,0.015,0.016,-0.001,0.132,0.068,0.019,0.073,0.007,0.098,-0.007,-0.639,0.024,0.015,-0.014,-0.037,-0.009,-0.104,-0.019,-0.193,-0.06,0.401,0.199,0.285,0.146,0.117,0.074,-0.15,-0.113,0.038,-0.0,0.02,-0.298


Sparseness of top 3 loadings: 0.3224137603321791


### Top weights for the first two components

In [7]:
display(loadings["PC1"].sort_values(key=abs, ascending=False).round(3)[:16])

Switch Rate                                        0.458
Task 2 CSI                                         0.316
Task 1 Difficulty                                 -0.265
Trial Transition Type Mapped_TTT_Pure             -0.260
Task 1 CSI                                         0.218
Task 2 Difficulty is NA_1                         -0.201
Task 2 CSI is NA_1                                -0.201
Task 2 Stimulus-Response Mapping Mapped_SRM2_NA   -0.201
Task 2 Cue Type Mapped_TCT2_NA                    -0.201
Task 2 Difficulty is NA_0                          0.201
Task 2 CSI is NA_0                                 0.201
Task 2 Difficulty                                 -0.200
Response Set Overlap Mapped_RSO_NA                -0.197
Response Set Overlap Mapped_RSO_Identical          0.172
Task 2 Response Probability                       -0.157
Trial Transition Type Mapped_TTT_Switch            0.133
Name: PC1, dtype: float64

In [8]:
display(loadings["PC2"].sort_values(key=abs, ascending=False).round(3)[:16])

Task 2 Response Probability                               0.439
Task 1 CSI                                               -0.349
RSI                                                      -0.301
Task 2 CSI                                               -0.258
Task 2 Cue Type Mapped_TCT2_Implicit                      0.244
Task 2 Difficulty                                         0.199
Inter-task SOA                                            0.196
Task 2 Cue Type Mapped_TCT2_NA                           -0.187
Task 2 Stimulus-Response Mapping Mapped_SRM2_NA          -0.187
Task 2 CSI is NA_0                                        0.187
Task 2 Difficulty is NA_0                                 0.187
Task 2 Difficulty is NA_1                                -0.187
Task 2 CSI is NA_1                                       -0.187
Response Set Overlap Mapped_RSO_NA                       -0.186
Task 2 Stimulus-Response Mapping Mapped_SRM2_Arbitrary    0.166
Task 1 Difficulty                       

In [9]:
display(loadings["PC3"].sort_values(key=abs, ascending=False).round(3)[:16])

Task 1 CSI                                                 0.457
Task 2 CSI                                                 0.434
Task 2 Response Probability                                0.335
Task 2 Difficulty                                          0.326
Task 1 Difficulty                                          0.324
RSI                                                        0.297
Inter-task SOA                                             0.154
Response Set Overlap Mapped_RSO_Disjoint                   0.121
Inter-task SOA is NA_0                                     0.118
Inter-task SOA is NA_1                                    -0.118
Distractor SOA                                            -0.101
Task 2 Cue Type Mapped_TCT2_Arbitrary                      0.098
Distractor SOA is NA_1                                     0.097
Distractor SOA is NA_0                                    -0.097
Task 2 Stimulus-Response Mapping Mapped_SRM2_Compatible    0.086
Task 1 Cue Type Mapped_TC

In [10]:
display(loadings["PC4"].sort_values(key=abs, ascending=False).round(3)[:16])

Distractor SOA                                            0.526
Task 2 Difficulty                                         0.507
Inter-task SOA                                           -0.351
Task 1 Difficulty                                         0.203
Switch Rate                                               0.185
Task 1 Cue Type Mapped_TCT_Arbitrary                      0.176
Task 1 Cue Type Mapped_TCT_Implicit                      -0.176
Task 1 CSI                                               -0.163
Task 1 Stimulus-Response Mapping Mapped_SRM_Compatible    0.160
Task 1 Stimulus-Response Mapping Mapped_SRM_Arbitrary    -0.153
Task 2 Response Probability                              -0.140
Response Set Overlap Mapped_RSO_Disjoint                  0.129
Task 2 Cue Type Mapped_TCT2_Arbitrary                     0.122
Response Set Overlap Mapped_RSO_Identical                -0.121
Task 2 Cue Type Mapped_TCT2_Implicit                     -0.114
Task 2 CSI                              

### Find Centroids of Paradigms and Interpolate Them

In [26]:
# --- 1. Combine the cleaned DF with the DF in the PC space ---
df_processed_no_paradigm = df_processed.drop(columns=['Paradigm'])
plot_df = pd.concat([
    df_processed.reset_index(drop=True),
    pca_df.reset_index(drop=True)
], axis=1)
plot_df['Point Type'] = 'Empirical Data' # Label these as original points

# --- 2. Calculate and Prepare Centroids for Plotting ---

# First, get a list of just the Principal Component column names
pc_cols = [col for col in plot_df.columns if col.startswith('PC')]
# Find the centroids in PC space
paradigm_centroids_pc = au.find_centroids(plot_df[pc_cols + ['Paradigm']], paradigm_col='Paradigm')

# Convert the dictionary to a DataFrame for Altair
centroids_df = pd.DataFrame.from_dict(paradigm_centroids_pc, orient='index').reset_index()
centroids_df = centroids_df.rename(columns={'index': 'Paradigm'})
centroids_df['Point Type'] = 'Centroid'
print("\nCalculated Centroids in PC Space:")
display(centroids_df)

# --- 3. Interpolate Paradigms and Inverse-Transform Them Back To The Original Space ---
interpolated_points_list = []
interpolation_pairs = [
    ('Dual-Task_PRP', 'Task Switching'),
    ('Dual-Task_PRP', 'Interference'),
    ('Task Switching', 'Interference')
]

for p1_name, p2_name in interpolation_pairs:
    # Get the centroids for the pair
    centroid1 = paradigm_centroids_pc.get(p1_name)
    centroid2 = paradigm_centroids_pc.get(p2_name)
    
    if not centroid1 or not centroid2:
        print(f"Warning: Could not find centroids for pair ({p1_name}, {p2_name}). Skipping interpolation.")
        continue

    # Interpolate to find the midpoint in PC space
    # We need to get just the PC values for interpolation
    pc_cols = [f'PC{i+1}' for i in range(pca_results.shape[1])]
    interpolated_pc_coords = au.interpolate_centroids(
        {k: v for k, v in centroid1.items() if k in pc_cols},
        {k: v for k, v in centroid2.items() if k in pc_cols},
        alpha=0.5
    )
    
    # Inverse transform the point back to the original high-level dimension space
    original_space_params = au.inverse_transform_point(interpolated_pc_coords, pipeline)
    
    # Create a dictionary for this new point, including its PC coordinates
    new_point = original_space_params.to_dict()
    for i, pc_coord in enumerate(interpolated_pc_coords):
        new_point[f'PC{i+1}'] = pc_coord
        
    # Add metadata for this point
    new_point['Point Type'] = 'Interpolated'
    new_point['Experiment'] = f"Interpolation: {p1_name} <-> {p2_name}"
    new_point['Paradigm'] = 'Interpolated Point' # Assign a unique paradigm name for styling
    new_point['Parent1'] = p1_name
    new_point['Parent2'] = p2_name
    
    interpolated_points_list.append(new_point)

# Create a DataFrame from the list of interpolated points
interpolated_df = pd.DataFrame(interpolated_points_list)
interpolated_df = au.reverse_map_categories(interpolated_df)
interpolated_df = au.apply_conceptual_constraints(interpolated_df)
# Create placeholder columns in interpolated_df that exist in plot_df but not in interpolated_df
#for col in plot_df.columns:
#    if col not in interpolated_df.columns and col not in pc_cols:
#         interpolated_df[col] = 'Interpolated' # or np.nan
print("\nGenerated Interpolated Points (showing a few key derived dimensions):")
print(interpolated_df.columns)
display(interpolated_df[['Experiment', 'Task 2 Response Probability', 'Task 2 Difficulty is NA',
                         'Task 2 Difficulty', 'Task 2 Stimulus-Response Mapping Mapped', 'Task 2 Stimulus-Response Mapping',
                         #'Stimulus-Stimulus Congruency',
                         'Stimulus Bivalence & Congruency',
                         'Inter-task SOA', 'Distractor SOA is NA', 'Switch Rate']].round(2))
display(interpolated_df[['Experiment'] + [c for c in interpolated_df.columns if not c.startswith("PC") and c != 'Experiment']].round(2))

# --- 4. Combine all DataFrames into one for plotting ---
final_plot_df = pd.concat([plot_df, centroids_df, interpolated_df], ignore_index=True)
final_plot_df['Parent1'] = final_plot_df['Parent1'].fillna('N/A')
final_plot_df['Parent2'] = final_plot_df['Parent2'].fillna('N/A')

# Undo the imputation on columns that were originally 'N/A'
final_plot_df = add_na_mask_from_flag(final_plot_df)


Calculated Centroids in PC Space:


Unnamed: 0,Paradigm,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,PC41,PC42,Point Type
0,Dual-Task_PRP,-1.040697,2.446748,1.356886,-0.407252,0.653377,-0.586366,0.222774,0.162985,-0.036109,0.018553,-0.049873,-0.042375,0.007816,-0.002213,0.066131,-0.027469,0.024828,0.013409,0.059025,0.021004,0.0221,0.00544,-0.000633,-0.000129,0.001777,0.001429,2.664535e-16,-2.245889e-15,3.811766e-16,-5.245804e-16,2.183439e-16,1.757853e-16,-3.9551700000000005e-17,4.274359e-16,3.122502e-16,-2.969847e-16,-3.404684e-16,1.998401e-16,2.393918e-16,-3.299858e-17,2.757054e-16,1.184238e-16,Centroid
1,Interference,-1.807626,-1.438675,-0.289766,0.062537,-0.044792,0.073438,-0.159804,0.070369,-0.244353,0.014516,-0.04025,-0.093151,0.000605,0.036876,0.105033,-0.042494,0.05664,0.025642,0.037556,-0.029512,-0.020602,-0.011113,-0.001827,0.00915,0.005434,0.000809,5.3148970000000005e-17,-1.68748e-16,2.078715e-16,-2.81099e-16,3.259804e-16,-6.377877000000001e-17,3.16384e-16,2.043283e-16,-2.881856e-16,-4.8424620000000006e-17,5.385763e-16,2.704692e-16,-2.684023e-16,-6.945956e-17,3.543265e-17,1.936985e-16,Centroid
2,Single-Task,-1.831934,-1.321452,-0.167275,-0.432202,0.059866,-0.077608,-0.079737,-0.627387,1.081917,0.205471,0.374866,0.224795,-0.028575,-0.081946,-0.263873,0.088059,-0.165447,-0.138646,-0.058314,0.134251,0.079294,0.043654,0.006104,-0.04557,-0.032015,0.003697,-4.465027e-17,-4.16937e-16,-1.73774e-16,-3.620292e-16,6.950962e-16,4.827057e-17,1.960992e-16,2.799693e-16,-1.188663e-16,-1.906687e-16,5.164951e-16,3.909916e-16,-2.434647e-16,-2.9257620000000003e-17,1.761876e-16,1.544658e-16,Centroid
3,Task Switching,1.595891,0.109442,-0.292604,0.165682,-0.211448,0.174789,0.020286,-0.011418,0.001463,-0.041881,-0.010732,0.03563,0.000764,-0.008423,-0.045185,0.02103,-0.017492,-0.000152,-0.033317,-0.009151,-0.007054,-0.001661,0.000403,0.001138,0.000691,-0.001435,-1.165089e-16,7.700559e-16,-2.052622e-16,-2.430227e-16,-1.49751e-16,1.258683e-16,-1.484601e-16,2.743284e-16,3.517858e-17,4.19561e-17,1.407143e-16,1.4200530000000002e-17,2.5798970000000002e-17,6.883986e-17,7.423003e-18,-1.303867e-16,Centroid



Generated Interpolated Points (showing a few key derived dimensions):
Index(['Task 2 Response Probability', 'Inter-task SOA', 'Distractor SOA',
       'Task 1 CSI', 'Task 2 CSI', 'RSI', 'Switch Rate', 'Task 1 Difficulty',
       'Task 2 Difficulty', 'Inter-task SOA is NA', 'Distractor SOA is NA',
       'Task 2 CSI is NA', 'Task 2 Difficulty is NA',
       'Response Set Overlap Mapped', 'RSI is Predictable',
       'Task 1 Stimulus-Response Mapping Mapped', 'Task 1 Cue Type Mapped',
       'Task 2 Stimulus-Response Mapping Mapped', 'Task 2 Cue Type Mapped',
       'Trial Transition Type Mapped', 'SBC_Mapped', 'PC1', 'PC2', 'PC3',
       'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12',
       'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21',
       'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30',
       'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39',
       'PC40', 'PC41', 'PC42', 'Point Type', 'Exper

Unnamed: 0,Experiment,Task 2 Response Probability,Task 2 Difficulty is NA,Task 2 Difficulty,Task 2 Stimulus-Response Mapping Mapped,Task 2 Stimulus-Response Mapping,Stimulus Bivalence & Congruency,Inter-task SOA,Distractor SOA is NA,Switch Rate
0,Interpolation: Dual-Task_PRP <-> Task Switching,0.5,0,2.68,SRM2_Arbitrary,Arbitrary,Incongruent,691.67,0,23.35
1,Interpolation: Dual-Task_PRP <-> Interference,0.5,1,2.74,SRM2_NA,,Incongruent,691.67,0,0.42
2,Interpolation: Task Switching <-> Interference,0.0,1,2.5,SRM2_NA,,Neutral,300.0,0,22.94


Unnamed: 0,Experiment,Task 2 Response Probability,Inter-task SOA,Distractor SOA,Task 1 CSI,Task 2 CSI,RSI,Switch Rate,Task 1 Difficulty,Task 2 Difficulty,Inter-task SOA is NA,Distractor SOA is NA,Task 2 CSI is NA,Task 2 Difficulty is NA,Response Set Overlap Mapped,RSI is Predictable,Task 1 Stimulus-Response Mapping Mapped,Task 1 Cue Type Mapped,Task 2 Stimulus-Response Mapping Mapped,Task 2 Cue Type Mapped,Trial Transition Type Mapped,SBC_Mapped,Point Type,Paradigm,Parent1,Parent2,Stimulus Bivalence & Congruency,Task 1 Stimulus-Response Mapping,Task 2 Stimulus-Response Mapping,Response Set Overlap,Trial Transition Type,Task 1 Cue Type,Task 2 Cue Type
0,Interpolation: Dual-Task_PRP <-> Task Switching,0.5,691.67,5.58,96.8,96.8,953.72,23.35,2.48,2.68,1,0,0,0,RSO_Identical,1,SRM_Compatible,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,Incongruent,Interpolated,Interpolated Point,Dual-Task_PRP,Task Switching,Incongruent,Compatible,Arbitrary,Identical,Pure,None/Implicit,None/Implicit
1,Interpolation: Dual-Task_PRP <-> Interference,0.5,691.67,7.02,59.57,-0.0,1365.63,0.42,2.66,2.74,1,0,1,1,RSO_NA,1,SRM_Compatible,TCT_Implicit,SRM2_NA,TCT2_NA,TTT_Pure,Incongruent,Interpolated,Interpolated Point,Dual-Task_PRP,Interference,Incongruent,Compatible,,,Pure,None/Implicit,
2,Interpolation: Task Switching <-> Interference,0.0,300.0,12.6,156.38,96.8,1429.69,22.94,2.4,2.5,1,0,1,1,RSO_NA,1,SRM_Arbitrary,TCT_Implicit,SRM2_NA,TCT2_NA,TTT_Pure,Neutral,Interpolated,Interpolated Point,Task Switching,Interference,Neutral,Arbitrary,,,Pure,None/Implicit,


### Plot Experimental Conditions

In [26]:
display(final_plot_df.head())
chart_layers = []

# Define the color scheme explicitly so we can reuse it
paradigm_colors = alt.Scale(domain=['Dual-Task_PRP', 'Interference', 'Task Switching', 'Single-Task', 'Other'], 
                            range=['#440154', '#34CBAF', '#CB3450', '#FFA500', '#cccccc'])

# --- 5. Define Tooltip Columns ---
tooltip_cols = [
    alt.Tooltip('Experiment:N', title='Experiment'),
    alt.Tooltip('Paradigm:N', title='Paradigm Class'),
    alt.Tooltip('Task 2 Response Probability:Q', title='T2 Response Probability', format='.2f'),
    alt.Tooltip('Inter-task SOA:N', title='Inter-task SOA'),
    alt.Tooltip('Distractor SOA:N', title='Distractor SOA'),
    alt.Tooltip('Task 1 CSI:Q', title='T1 CSI', format='.0f'),
    alt.Tooltip('Task 2 CSI:Q', title='T2 CSI', format='.0f'),
    alt.Tooltip('RSI:Q', title='RSI', format='.0f'),
    alt.Tooltip('RSI is Predictable:N', title='RSI Predictable'),
    alt.Tooltip('Switch Rate:Q', title='Switch Rate (%)', format='.1f'),
    alt.Tooltip('Trial Transition Type:N', title='Transition Type'),
    alt.Tooltip('Stimulus-Stimulus Congruency:N', title='Stimulus-Stimulus Congruency'),
    alt.Tooltip('Stimulus-Response Congruency:N', title='Stimulus-Response Congruency'),
    alt.Tooltip('Response Set Overlap:N', title='Response Set Overlap'),
    alt.Tooltip('Task 1 Stimulus-Response Mapping:N', title='T1 Stimulus-Response Mapping'),
    alt.Tooltip('Task 2 Stimulus-Response Mapping:N', title='T2 Stimulus-Response Mapping'),
    alt.Tooltip('Task 1 Difficulty:Q', title='T1 Difficulty', format='.1f'),
    alt.Tooltip('Task 2 Difficulty:Q', title='T2 Difficulty', format='.1f'),
    alt.Tooltip('Task 2 CSI is NA:N', title='T2 CSI is N/A'),
    alt.Tooltip('Task 2 Difficulty is NA:N', title='T2 Difficulty is N/A'),
    alt.Tooltip('Inter-task SOA is NA:N', title='Inter-task SOA is N/A'),
    alt.Tooltip('Distractor SOA is NA:N', title='Distractor SOA is N/A')
]

# --- 6. Create the Layered Altair Chart ---

# Layer 1: Centroids (large, transparent circles)
# Zip the domain and range to loop through them together
for category, hex_color in zip(paradigm_colors.domain, paradigm_colors.range):

    # 1. Convert the hex color to its RGB components
    r, g, b = hex_to_rgb(hex_color)

    # 2. Create the RGBA string for the gradient
    transparent_rgba = f'rgba({r}, {g}, {b}, 0)'
    # 3. Create the chart layer for this category
    layer = alt.Chart(final_plot_df).mark_circle(size=20000).encode(
        x='PC1:Q',
        y='PC2:Q',
        color=alt.value({
            "gradient": "radial",
            "stops": [
                {"offset": 0, "color": hex_color},          # The solid hex color at the center
                {"offset": 1, "color": transparent_rgba}    # The transparent version at the edge
            ]
        })
    ).transform_filter(
        (alt.datum.Paradigm == category) & (alt.datum['Point Type'] == 'Centroid') # Filter for the current category
    )

    chart_layers.append(layer)

# Layer 2: Empirical Data Points (standard circles)
empirical_chart = alt.Chart(final_plot_df).mark_circle(
    size=100,
    opacity=0.8
).encode(
    x=alt.X('PC1:Q', title='Principal Component 1 (Task-Set Dynamics: Sequential Switching vs. Concurrent Coordination)'),
    y=alt.Y('PC2:Q', title='Principal Component 2 (Task Set Size and Temporal Manipulations'),
    color=alt.Color('Paradigm:N', title='Paradigm Class', scale=paradigm_colors),
    tooltip=tooltip_cols
).transform_filter(
    alt.datum['Point Type'] == 'Empirical Data'
)
chart_layers.append(empirical_chart)

# Layer 3: Interpolated Points (large, black diamonds to stand out)
interpolated_chart = alt.Chart(final_plot_df).mark_point(
    size=400,
    shape='M0,.5L.6,.8L.5,.1L1,-.3L.3,-.4L0,-1L-.3,-.4L-1,-.3L-.5,.1L-.6,.8L0,.5Z',
    filled=True,
    strokeWidth=4 # Make the stroke thicker to be more visible
).encode(
    x=alt.X('PC1:Q'),
    y=alt.Y('PC2:Q'),
    # Use the 'Parent1' and 'Parent2' columns to drive the colors
    color=alt.Color('Parent1:N', title='Paradigm Class', scale=paradigm_colors, legend=None), # Fill color
    stroke=alt.Color('Parent2:N', title='Paradigm Class', scale=paradigm_colors, legend=None), # Stroke color
    tooltip=tooltip_cols
).transform_filter(
    alt.datum['Point Type'] == 'Interpolated'
)
chart_layers.append(interpolated_chart)

# --- 7. Add Zero Lines and Combine Layers ---
zero_line_h = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(strokeDash=[5,5], color='grey').encode(y='y')
zero_line_v = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(strokeDash=[5,5], color='grey').encode(x='x')
chart_layers.append(zero_line_h)
chart_layers.append(zero_line_v)

# Layer all the charts together. The order is important: bottom layers are drawn first.
final_layered_chart = alt.layer(
    #centroids_chart,
    *chart_layers
).properties(
    title='PCA of Cognitive Control Paradigms with Centroids and Interpolated Points',
    width=1100,
    height=900
).interactive()

final_layered_chart = final_layered_chart.resolve_scale(
    color='independent',
    stroke='independent'
).resolve_legend(
    color='shared'
)
final_layered_chart.save('pca_plot.json')
# Display the final chart
final_layered_chart

Unnamed: 0,Experiment,Number of Tasks,Task 2 Response Probability,Inter-task SOA,Distractor SOA,Task 1 CSI,Task 2 CSI,Switch Rate,Trial Transition Type,Stimulus-Stimulus Congruency,Stimulus-Response Congruency,Response Set Overlap,Task 1 Stimulus-Response Mapping,Task 2 Stimulus-Response Mapping,Task 1 Cue Type,Task 2 Cue Type,RSI is Predictable,RSI,Task 1 Difficulty,Task 2 Difficulty,Task 1 Type,Task 2 Type,Notes,Super_Experiment_Mapping_Notes,Task 1 Difficulty Norm,Task 2 Difficulty Norm,Paradigm,Inter-task SOA is NA,Distractor SOA is NA,Task 2 CSI is NA,Task 2 Difficulty is NA,Stimulus Bivalence & Congruency,SBC_Mapped,Response Set Overlap Mapped,Task 1 Stimulus-Response Mapping Mapped,Task 1 Cue Type Mapped,Task 2 Stimulus-Response Mapping Mapped,Task 2 Cue Type Mapped,Trial Transition Type Mapped,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,PC41,PC42,Point Type,Parent1,Parent2
0,Telford 1931 Auditory RT (500ms SOA),2.0,1.0,500.0,,0.0,0.0,0.0,Pure,,,Identical,Arbitrary,Arbitrary,None/Implicit,None/Implicit,1.0,1000.0,1.0,1.0,Auditory RT,Auditory RT,Both tasks were of the same task (i.e. two aud...,"{""param_overrides"": {""t1_stim_duration"":300, ""...",0.0,0.0,Dual-Task_PRP,0.0,1.0,0.0,0.0,,,RSO_Identical,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,0.513543,1.32238,-1.065945,-3.28166,2.245795,-1.316478,1.467607,0.568128,1.622804,-0.257528,-0.37649,0.573416,0.182814,0.454522,0.692726,0.107377,0.365235,-0.158984,0.771305,0.468704,-0.162345,0.167027,-0.051996,-0.134883,0.05615,-0.124431,6.938894e-16,-2.955969e-15,-7.771561e-16,0.0,4.440892e-16,4.440892e-16,5.828671e-16,7.771561e-16,2.914335e-16,-1.110223e-16,0.0,3.330669e-16,1.595946e-16,1.077015e-16,7.21645e-16,-2.220446e-16,Empirical Data,,
1,Telford 1931 Auditory RT (1000ms SOA),2.0,1.0,1000.0,,0.0,0.0,0.0,Pure,,,Identical,Arbitrary,Arbitrary,None/Implicit,None/Implicit,1.0,1000.0,1.0,1.0,Auditory RT,Auditory RT,Both tasks were of the same task (i.e. two aud...,"{""param_overrides"": {""t1_stim_duration"":300, ""...",0.0,0.0,Dual-Task_PRP,0.0,1.0,0.0,0.0,,,RSO_Identical,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,0.485146,1.41238,-0.994951,-3.443063,2.38485,-0.98357,1.388661,0.438921,1.523736,-0.201517,-0.390652,0.543954,0.173942,0.468132,0.698883,0.12339,0.383217,-0.121462,0.74004,0.442575,-0.148425,0.162884,-0.053573,-0.132843,0.056738,-0.120432,7.494005e-16,-2.955969e-15,-5.551115e-16,0.0,4.440892e-16,4.440892e-16,5.828671e-16,7.771561e-16,2.914335e-16,-1.110223e-16,0.0,3.330669e-16,1.595946e-16,9.188122e-17,7.21645e-16,-2.220446e-16,Empirical Data,,
2,Telford 1931 Auditory RT (2000ms SOA),2.0,1.0,2000.0,,0.0,0.0,0.0,Pure,,,Identical,Arbitrary,Arbitrary,None/Implicit,None/Implicit,1.0,1000.0,1.0,1.0,Auditory RT,Auditory RT,Both tasks were of the same task (i.e. two aud...,"{""param_overrides"": {""t1_stim_duration"":300, ""...",0.0,0.0,Dual-Task_PRP,0.0,1.0,0.0,0.0,,,RSO_Identical,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,0.428352,1.592381,-0.852963,-3.765869,2.66296,-0.317755,1.230771,0.180507,1.3256,-0.089493,-0.418976,0.485028,0.156198,0.495353,0.711198,0.155416,0.419181,-0.04642,0.677511,0.390316,-0.120585,0.154599,-0.056729,-0.128763,0.057915,-0.112433,8.049117e-16,-2.844947e-15,-5.551115e-16,0.0,4.440892e-16,5.551115e-16,5.828671e-16,7.771561e-16,2.914335e-16,-1.110223e-16,0.0,5.551115e-16,2.706169e-16,6.024066000000001e-17,7.21645e-16,-2.220446e-16,Empirical Data,,
3,Telford 1931 Auditory RT (4000ms SOA),2.0,1.0,4000.0,,0.0,0.0,0.0,Pure,,,Identical,Arbitrary,Arbitrary,None/Implicit,None/Implicit,1.0,1000.0,1.0,1.0,Auditory RT,Auditory RT,Both tasks were of the same task (i.e. two aud...,"{""param_overrides"": {""t1_stim_duration"":300, ""...",0.0,0.0,Dual-Task_PRP,0.0,1.0,0.0,0.0,,,RSO_Identical,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,0.314764,1.952383,-0.568986,-4.41148,3.219179,1.013875,0.91499,-0.336321,0.929328,0.134554,-0.475624,0.367177,0.12071,0.549794,0.735829,0.219468,0.49111,0.103666,0.552452,0.285798,-0.064904,0.138029,-0.06304,-0.120602,0.060269,-0.096434,9.15934e-16,-2.733924e-15,-1.110223e-16,1.110223e-16,4.440892e-16,6.661338e-16,5.828671e-16,7.771561e-16,4.024558e-16,-1.110223e-16,1.110223e-16,7.771561e-16,4.371503e-16,-3.040475e-18,9.436896e-16,-2.220446e-16,Empirical Data,,
4,Telford 1931 Line Length (1000ms SOA),2.0,1.0,1000.0,,0.0,0.0,0.0,Pure,,,Identical,Arbitrary,Arbitrary,None/Implicit,None/Implicit,1.0,1000.0,3.0,3.0,Line Length Comparison,Line Length Comparison,Both tasks were of the same task (i.e. two len...,,0.5,0.5,Dual-Task_PRP,0.0,1.0,0.0,0.0,,,RSO_Identical,SRM_Arbitrary,TCT_Implicit,SRM2_Arbitrary,TCT2_Implicit,TTT_Pure,-1.280335,2.714599,1.476719,-0.734538,0.264094,-0.043762,0.116781,0.813897,0.693555,-0.524953,0.223624,0.309947,0.092497,-0.252834,-0.168213,-0.262646,-0.322628,-0.564205,0.40902,0.369062,-0.223825,0.053892,0.034856,-0.016227,-0.014648,-0.047421,2.498002e-16,-2.733924e-15,-1.110223e-16,-2.220446e-16,4.440892e-16,3.330669e-16,-8.326673e-17,7.771561e-16,4.024558e-16,-5.551115e-16,-3.330669e-16,3.330669e-16,2.706169e-16,-5.47781e-17,5.5511150000000004e-17,-2.220446e-16,Empirical Data,,


In [21]:
# --- 5. Define Tooltip Columns ---
tooltip_cols = [
    alt.Tooltip('Experiment:N', title='Experiment'),
    alt.Tooltip('Paradigm:N', title='Paradigm Class'),
    alt.Tooltip('Task 2 Response Probability:Q', title='T2 Response Probability', format='.2f'),
    alt.Tooltip('Inter-task SOA:N', title='Inter-task SOA'),
    alt.Tooltip('Distractor SOA:N', title='Distractor SOA'),
    alt.Tooltip('Task 1 CSI:Q', title='T1 CSI', format='.0f'),
    alt.Tooltip('Task 2 CSI:Q', title='T2 CSI', format='.0f'),
    alt.Tooltip('RSI:Q', title='RSI', format='.0f'),
    alt.Tooltip('RSI is Predictable:N', title='RSI Predictable'),
    alt.Tooltip('Switch Rate:Q', title='Switch Rate (%)', format='.1f'),
    alt.Tooltip('Trial Transition Type:N', title='Transition Type'),
    alt.Tooltip('Stimulus-Stimulus Congruency:N', title='Stimulus-Stimulus Congruency'),
    alt.Tooltip('Stimulus-Response Congruency:N', title='Stimulus-Response Congruency'),
    alt.Tooltip('Stimulus Bivalence & Congruency:N', title='Stimulus Bivalence & Congruency'),
    alt.Tooltip('Response Set Overlap:N', title='Response Set Overlap'),
    alt.Tooltip('Task 1 Stimulus-Response Mapping:N', title='T1 Stimulus-Response Mapping'),
    alt.Tooltip('Task 2 Stimulus-Response Mapping:N', title='T2 Stimulus-Response Mapping'),
    alt.Tooltip('Task 1 Difficulty:Q', title='T1 Difficulty', format='.1f'),
    alt.Tooltip('Task 2 Difficulty:Q', title='T2 Difficulty', format='.1f'),
    alt.Tooltip('Task 2 CSI is NA:N', title='T2 CSI is N/A'),
    alt.Tooltip('Task 2 Difficulty is NA:N', title='T2 Difficulty is N/A'),
    alt.Tooltip('Inter-task SOA is NA:N', title='Inter-task SOA is N/A'),
    alt.Tooltip('Distractor SOA is NA:N', title='Distractor SOA is N/A')
]
# --- 1. Combine data and calculate centroids ---
plot_df = pd.concat([
    df_processed.reset_index(drop=True),
    pca_df.reset_index(drop=True)
], axis=1)
plot_df['Point Type'] = 'Empirical Data'

centroids_df = au.find_centroids(plot_df[[c for c in plot_df.columns if c.startswith("PC")] + ["Paradigm"]], paradigm_col='Paradigm')
centroids_df = pd.DataFrame.from_dict(centroids_df, orient='index').reset_index()
centroids_df = centroids_df.rename(columns={'index': 'Paradigm'})
centroids_df['Point Type'] = 'Centroid'

# --- 2. Generate interpolated points using the new utility function ---
interpolation_pairs = [
    ('Dual-Task_PRP', 'Task Switching'),
    ('Dual-Task_PRP', 'Interference'),
    ('Task Switching', 'Interference')
]

model_artifacts_pca = {'type': 'pca', 'pipeline': pipeline}

interpolated_df = au.generate_interpolated_points(
    latent_space_df=plot_df,
    model_artifacts=model_artifacts_pca,
    interpolation_pairs=interpolation_pairs
)

# --- 3. Combine all dataframes for plotting ---
final_plot_df = pd.concat([plot_df, centroids_df, interpolated_df], ignore_index=True)
final_plot_df['Parent1'] = final_plot_df['Parent1'].fillna('N/A')
final_plot_df['Parent2'] = final_plot_df['Parent2'].fillna('N/A')
final_plot_df = plot.add_na_mask_from_flag(final_plot_df)

# Now, call the plotting function from plot.py
# (Assuming you have also moved the plotting logic there)
pca_chart = plot.create_latent_space_plot(final_plot_df, "PC1", "PC2", "Principal Component 1", "Principal Component 2",
                                          "Cognitive Control Experimental Conditions", tooltip_cols,
                                          has_interpolation=True)
pca_chart

In [20]:
np.mean(au.get_loadings_sparseness(loadings[["PC1", "PC2", "PC3"]].to_numpy()))

np.float64(0.3224137603321791)