In [1]:
import pandas as pd
import os
import warnings
from statsmodels.formula.api import ols
import statsmodels.api as sm
# Notebook-wide runtime settings.
# Ignore every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore', category=Warning)
# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for a duplicate-OpenMP-runtime error — confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

### Load the data

In [8]:
# Location of the raw Parquet export (relative path)
file_path = "raw/ambitus_0_15_log_24_07_2025.parquet"
# Read the file and lowercase both gender columns in one chained step;
# assigning to existing column names replaces them in place.
df = pd.read_parquet(file_path).assign(
    GR_Gender=lambda frame: frame['GR_Gender'].str.lower(),
    Gender=lambda frame: frame['Gender'].str.lower(),
)

In [10]:
# Discard rows where GR_Gender or Year is missing, then derive the two
# columns used downstream:
#   Sex       <- lowercase copy of Gender
#   Group_Sex <- lowercase copy of GR_Gender
df = (
    df
    .dropna(subset=['GR_Gender', 'Year'])
    .assign(
        Sex=lambda frame: frame['Gender'].str.lower(),
        Group_Sex=lambda frame: frame['GR_Gender'].str.lower(),
    )
)

### ANOVA test

In [13]:
# Outcome columns to analyse, mapped to the labels shown in the results table
anova_targets = {
    'LOCO_TOT': 'Locomotion (Loco_TOT)',
    'LOCO_BEF': 'Locomotion frequency (LOCO_BEF)',
    'Expl_E_I_TOT_Loco_ratio': 'Exploration (Expl_E_I_TOT_Loco_ratio)',
    'Expl_E_I_BEF_Nr': 'Exploration frequency (Expl_E_I_BEF_Nr)',
    'L_C': 'Learning capacity (L_C)',
    'E_E': 'Effective exploration ratio (E_E)'
}

# Right-hand side shared by every model: main effects of Group, Sex and Year
# plus the Group x Year and Sex x Year interactions.
model_terms = " + ".join([
    "C(Group)",
    "C(Sex)",
    "C(Year)",
    "C(Group):C(Year)",
    "C(Sex):C(Year)",
])

# One ANOVA table (typ=2) per outcome, collected for later concatenation
anova_results = []

for feature, description in anova_targets.items():
    formula = f"{feature} ~ {model_terms}"
    model = ols(formula, data=df).fit()
    # Tag each table with its feature label and keep the term names as a column,
    # because the index is discarded when the table is stored.
    anova_table = sm.stats.anova_lm(model, typ=2).assign(
        Feature=description,
        Variable=lambda table: table.index,
    )
    anova_results.append(anova_table.reset_index(drop=True))

In [14]:
# Stack the per-feature ANOVA tables row-wise into one frame with a fresh integer index
final_anova_df = pd.concat(objs=anova_results, axis=0, ignore_index=True)

In [15]:
# Work on a copy so final_anova_df keeps the unformatted ANOVA results
df_anova = final_anova_df.copy()

# Map statsmodels term names to the short column headers of the summary table
label_map = {
    'C(Group)': 'Gr',
    'C(Sex)': 'Sex',
    'C(Year)': 'Year',
    'C(Group):C(Year)': 'Gr/Y',
    'C(Sex):C(Year)': 'Sex/Y'
}

# Keep only the terms listed in label_map (this drops e.g. the Residual row).
# The explicit .copy() makes the filtered frame independent, so the column
# assignments below never write into a slice of another DataFrame
# (chained assignment / SettingWithCopyWarning).
df_anova = df_anova[df_anova['Variable'].isin(label_map.keys())].copy()

# Short effect label for each retained term
df_anova['Effect'] = df_anova['Variable'].map(label_map)

# Smallest p-value that can still be printed exactly with four decimals
P_DISPLAY_FLOOR = 1e-4


def _format_f_p(f_value, p_value, p_floor=P_DISPLAY_FLOOR):
    """Format an F statistic and its p-value as a single table cell.

    The p-value is reported as an equality ("p = 0.0547") because it is the
    computed value, not an upper bound. Values below ``p_floor`` would round
    to 0.0000, so they are reported as "p < 0.0001" instead.

    Parameters
    ----------
    f_value : float
        F statistic of the term.
    p_value : float
        p-value of the term (the ``PR(>F)`` column).
    p_floor : float, default 1e-4
        Threshold below which the p-value is shown as an upper bound.

    Returns
    -------
    str
        Text of the form ``"F (p = x)"`` or ``"F (p < floor)"``.
    """
    if p_value < p_floor:
        return f"{f_value:.2f} (p < {p_floor:.4f})"
    return f"{f_value:.2f} (p = {p_value:.4f})"


# Combine F and p into one readable string per term
df_anova['F(p)'] = [
    _format_f_p(f_value, p_value)
    for f_value, p_value in zip(df_anova['F'], df_anova['PR(>F)'])
]

# One row per feature, one column per effect
table_formatted = df_anova.pivot(index='Feature', columns='Effect', values='F(p)')

# Put the effect columns into the intended presentation order
ordered_cols = ['Gr', 'Sex', 'Year', 'Gr/Y', 'Sex/Y']
table_formatted = table_formatted.reindex(columns=ordered_cols)

In [18]:
# Render the formatted ANOVA summary (rows: features, columns: effects)
display(table_formatted)

Effect,Gr,Sex,Year,Gr/Y,Sex/Y
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Effective exploration ratio (E_E),102.58 (p < 0.0000),3.69 (p < 0.0547),19.32 (p < 0.0000),12.56 (p < 0.0000),3.00 (p < 0.0063)
Exploration (Expl_E_I_TOT_Loco_ratio),5.04 (p < 0.0247),28.59 (p < 0.0000),34.75 (p < 0.0000),3.67 (p < 0.0012),2.18 (p < 0.0424)
Exploration frequency (Expl_E_I_BEF_Nr),67.93 (p < 0.0000),25.22 (p < 0.0000),34.10 (p < 0.0000),12.19 (p < 0.0000),2.76 (p < 0.0112)
Learning capacity (L_C),261.45 (p < 0.0000),2.80 (p < 0.0943),28.40 (p < 0.0000),11.74 (p < 0.0000),2.06 (p < 0.0545)
Locomotion (Loco_TOT),147.16 (p < 0.0000),94.49 (p < 0.0000),14.02 (p < 0.0000),12.63 (p < 0.0000),1.70 (p < 0.1163)
Locomotion frequency (LOCO_BEF),37.25 (p < 0.0000),44.34 (p < 0.0000),22.36 (p < 0.0000),2.96 (p < 0.0069),5.35 (p < 0.0000)


### Experiment with ANOVA Models

-> Playground cell <-

This cell allows you to experiment with different ANOVA models by changing:
- target_variable: The behavioral feature to analyze (e.g. L_C, EXPL_TOT, E_E, etc.)
- formula: The structure of the model, including main effects and interactions

Suggestions:
- Try adding/removing predictors like C(Season) or C(Paradigm)
- Test interaction terms, e.g. C(Group):C(Sex)
- Replace the outcome variable with another metric from the dataset

Some combinations may result in collinearity or missing data. Use dropna() as needed, then fit the model on the cleaned frame by passing data=df_clean to ols():
df_clean = df.dropna(subset=[target_variable, 'Group', 'Sex', 'Year'])


In [22]:
# Playground: choose the outcome column and edit the model terms below

target_variable = 'LOCO_TOT'  # any numeric behavioral feature can be used here
# Main effects on the first line, interaction terms on the second
formula = (
    f"{target_variable} ~ "
    "C(Group) + C(Sex) + C(Year) "
    "+ C(Group):C(Year)"
)

# Ordinary least squares fit of the chosen formula
model = ols(formula=formula, data=df).fit()

# ANOVA table (typ=2) for the fitted model
anova_table = sm.stats.anova_lm(model, typ=2)
display(anova_table)


Unnamed: 0,sum_sq,df,F,PR(>F)
C(Group),12314.133755,1.0,145.685555,4.0806360000000004e-33
C(Sex),7980.842955,1.0,94.419434,3.8860370000000003e-22
C(Year),7104.719036,6.0,14.009037,6.971234e-16
C(Group):C(Year),6176.4504,6.0,12.178683,1.193428e-13
Residual,452464.60999,5353.0,,
