In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil import relativedelta

# Project root and the folder holding the pyannote metric outputs
DATA_PATH = Path('/home/engaclew/neurogen')
results_folder = Path('/home/engaclew/neurogen/results/pyannote_metrics')

# Per-clip identification error tables: one CSV for LENA, one for VTC
lena_csv = results_folder / 'its_eaf_an1' / 'ider_2mn_clips.csv'
vtc_csv = results_folder / 'vtc_eaf_an1' / 'ider_2mn_clips.csv'
ider_lena = pd.read_csv(lena_csv)
ider_vtc = pd.read_csv(vtc_csv)

# Make group_id an ordered categorical so that sorting follows this explicit
# group order, then sort by group and recording and renumber the rows.
desired_order = ['low_risk', 'angelman_syndrome', 'fragile_x_syndrome', 'down_syndrome', 'autism_sibling']
sort_keys = ['group_id', 'recording_id']
ider_lena = (
    ider_lena
    .assign(group_id=pd.Categorical(ider_lena['group_id'], categories=desired_order, ordered=True))
    .sort_values(sort_keys)
    .reset_index(drop=True)
)
ider_vtc = (
    ider_vtc
    .assign(group_id=pd.Categorical(ider_vtc['group_id'], categories=desired_order, ordered=True))
    .sort_values(sort_keys)
    .reset_index(drop=True)
)

# Child and recording metadata, joined on child_id and trimmed to the columns
# used downstream.
children = pd.read_csv(DATA_PATH / 'data/L3_HIPAA_LENA_cleaned/metadata/children.csv')
recordings = pd.read_csv(DATA_PATH / 'data/L3_HIPAA_LENA_cleaned/metadata/recordings.csv')
metadata_columns = ['group_id', 'date_iso', 'recording_filename', 'child_sex', 'child_dob', 'child_id']
recordings_data = pd.merge(recordings, children, on='child_id')[metadata_columns]
def diff_month(row):
    """
    Return the number of calendar months between birth and recording.

    `row` must expose 'date_iso' (recording date) and 'child_dob' (date of
    birth), both as 'YYYY-MM-DD' strings. Only the year and month fields
    contribute to the result; the day of the month is ignored.
    """
    date_format = '%Y-%m-%d'
    recorded_on = datetime.strptime(row['date_iso'], date_format)
    born_on = datetime.strptime(row['child_dob'], date_format)
    years_apart = recorded_on.year - born_on.year
    months_apart = recorded_on.month - born_on.month
    return years_apart * 12 + months_apart
# Age in months at recording time, computed row by row from the two date columns
recordings_data['age'] = recordings_data.apply(diff_month, axis=1)


# Attach recording/child metadata to every clip. It is a left join, so every
# clip row is kept; columns that already exist in the clip table keep their
# name and the metadata copy receives a '_y' suffix.
merge_options = dict(
    how='left',
    left_on='recording_id',
    right_on='recording_filename',
    suffixes=('', '_y'),
)
ider_lena = ider_lena.merge(recordings_data, **merge_options)
ider_vtc = ider_vtc.merge(recordings_data, **merge_options)

def compute_ider(ider_data):
    """
    Express the identification-error components as percentages of 'total'.

    Parameters
    ----------
    ider_data : pandas.DataFrame
        Must contain the columns 'missed detection', 'false alarm',
        'confusion', 'correct', 'total' and 'ider'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        - 'missed_detection' and 'false_alarm' are added as 100 * column / total,
        - 'confusion' and 'correct' are replaced by 100 * column / total
          (their names contain no space, so the percentage overwrites them),
        - 'ider' is multiplied by 100.

    The input frame is left untouched, so calling this again on the same raw
    frame gives the same result instead of rescaling already-scaled columns.
    """
    cols = ['missed detection', 'false alarm', 'confusion', 'correct']
    # Work on a copy: the caller's frame must not be modified behind its back.
    ider_data = ider_data.copy()
    for col in cols:
        ider_data[col.replace(' ', '_')] = 100*ider_data[col]/ider_data['total']
    ider_data['ider'] = 100 * ider_data['ider']
    return ider_data

# Convert both tables to percentages
ider_lena = compute_ider(ider_lena)
ider_vtc = compute_ider(ider_vtc)

# Drop the clips whose LENA confusion is NaN from BOTH tables.
# NOTE(review): the mask is built from the LENA frame only and is applied to
# the VTC frame through its index, so this assumes the two frames are
# row-aligned (same clip at the same index) — TODO confirm.
nan_rows = ider_lena['confusion'].isna()
ider_lena = ider_lena.loc[~nan_rows]
ider_vtc = ider_vtc.loc[~nan_rows]

In [3]:
# Show the LENA per-clip table as this cell's output
ider_lena

Unnamed: 0,recording_id,onset,offset,group_id,missed detection,total,confusion,correct,false alarm,ider,group_id_y,date_iso,recording_filename,child_sex,child_dob,child_id,age,missed_detection,false_alarm
0,20180530_181655_022873.wav,39778000,39898000,low_risk,10330.0,56624.0,19.613591,62.143261,11356.0,57.911840,low_risk,2018-05-21,20180530_181655_022873.wav,f,2016-08-21,3321,21,18.243148,20.055100
1,20180530_181655_022873.wav,14868000,14988000,low_risk,3437.0,50467.0,1.426675,91.762934,19320.0,46.519508,low_risk,2018-05-21,20180530_181655_022873.wav,f,2016-08-21,3321,21,6.810391,38.282442
2,20180530_181655_022873.wav,7296000,7416000,low_risk,15659.0,42621.0,9.478895,53.781000,3688.0,54.872011,low_risk,2018-05-21,20180530_181655_022873.wav,f,2016-08-21,3321,21,36.740105,8.653011
3,20180530_181655_022873.wav,18264000,18384000,low_risk,13855.0,49851.0,15.271509,56.935668,6854.0,56.813304,low_risk,2018-05-21,20180530_181655_022873.wav,f,2016-08-21,3321,21,27.792823,13.748972
4,20180530_181655_022873.wav,2202000,2322000,low_risk,6784.0,25841.0,11.671375,62.075771,7823.0,68.197825,low_risk,2018-05-21,20180530_181655_022873.wav,f,2016-08-21,3321,21,26.252854,30.273596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,20230405_090552_045738_2.wav,48958000,49078000,autism_sibling,18698.0,48896.0,15.899051,45.860602,3162.0,60.606185,autism_sibling,2023-03-26,20230405_090552_045738_2.wav,m,2021-07-13,7281,20,38.240347,6.466787
745,20230405_090552_045738_2.wav,16289000,16409000,autism_sibling,6567.0,56947.0,8.016226,80.451999,16330.0,48.223787,autism_sibling,2023-03-26,20230405_090552_045738_2.wav,m,2021-07-13,7281,20,11.531775,28.675786
747,20230405_090552_045738_2.wav,1639000,1759000,autism_sibling,15724.0,61253.0,6.794769,67.534651,6701.0,43.405221,autism_sibling,2023-03-26,20230405_090552_045738_2.wav,m,2021-07-13,7281,20,25.670579,10.939872
748,20230405_090552_045738_2.wav,5817000,5937000,autism_sibling,6560.0,27349.0,17.591137,58.422611,7391.0,68.602143,autism_sibling,2023-03-26,20230405_090552_045738_2.wav,m,2021-07-13,7281,20,23.986252,27.024754


In [2]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

def _format_p_value(p_val):
    """Render a p-value as text followed by its significance stars."""
    if p_val < 0.001:
        return "< .001***"
    p_value = f"{p_val:.3f}"
    if p_val < 0.01:
        p_value += "**"
    elif p_val < 0.05:
        p_value += "*"
    return p_value


def _clean_predictor_name(name):
    """Strip the 'group_id[T.' / 'child_sex[T.' wrappers and underscores from a term name."""
    return (name.replace('group_id[T.', '')
                .replace('child_sex[T.', '')
                .replace(']', '')
                .replace('_', ' '))


def print_full_mixed_effects_results(data, dvs):
    """
    Print mixed effects results table for multiple dependent variables.

    For every name in `dvs`, fits `dv ~ group_id + age + child_sex` with
    `child_id` as the grouping variable and prints one line per fixed-effect
    term: z statistic, p-value with significance stars, and
    z**2 / (z**2 + df_resid), which is reported as partial eta-squared.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in `dvs` plus 'group_id', 'age',
        'child_sex' and 'child_id'.
    dvs : iterable of str
        Names of the dependent-variable columns.

    Returns
    -------
    None
        The table is written to stdout.

    Notes
    -----
    Statistics are read from the fitted result (`tvalues` / `pvalues`) rather
    than parsed back from the rounded text of `summary()`, so the significance
    stars are decided on the unrounded p-values.
    ConvergenceWarning is silenced only while a model is being fitted; the
    process-wide warning filters are left as they were.
    """
    print("\nTable X. Mixed Effects Analysis Results")
    print("-" * 80)
    print(f"{'Dependent Variable':<20} {'Predictor':<20} {'z':>8} {'p':>10} {'η²p':>8}")
    print("-" * 80)

    for dv in dvs:
        # Fit model; the warning filter is restored when the `with` block exits
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=ConvergenceWarning)
            model = smf.mixedlm(
                f"{dv} ~ group_id + age + child_sex",
                data=data,
                groups="child_id"
            ).fit()

        # The fixed effects come first in the parameter vector; everything
        # after them belongs to the random-effects structure and is skipped.
        predictors = list(model.model.exog_names)
        n_fixed = len(predictors)
        z_stats = list(model.tvalues)[:n_fixed]
        p_vals = list(model.pvalues)[:n_fixed]
        df_resid = model.df_resid

        for name, z_stat, p_val in zip(predictors, z_stats, p_vals):
            z_stat = float(z_stat)
            p_val = float(p_val)

            clean_name = _clean_predictor_name(name)

            # Calculate partial eta-squared
            eta_sq = (z_stat**2) / (z_stat**2 + df_resid)

            p_value = _format_p_value(p_val)

            print(f"{dv:<20} {clean_name:<20} {z_stat:>8.2f} {p_value:>10} {eta_sq:>8.3f}")

        print()

    print("-" * 80)
    print("Note: η²p = partial eta-squared")
    print("* p < .05, ** p < .01, *** p < .001")

# Run the same set of models on each system's table, under a heading per system
dvs = ['confusion', 'missed_detection', 'false_alarm', 'correct']
for label, frame in (('LENA', ider_lena), ('ACLEW', ider_vtc)):
    print(label)
    print_full_mixed_effects_results(frame, dvs)

LENA

Table X. Mixed Effects Analysis Results
--------------------------------------------------------------------------------
Dependent Variable   Predictor                   z          p      η²p
--------------------------------------------------------------------------------
confusion            Intercept                2.00     0.046*    0.007
confusion            angelman syndrome        0.12      0.902    0.000
confusion            fragile x syndrome       0.90      0.366    0.001
confusion            down syndrome           -0.57      0.572    0.001
confusion            autism sibling           1.71      0.087    0.005
confusion            m                       -0.02      0.985    0.000
confusion            age                      0.98      0.327    0.002

missed_detection     Intercept                5.45  < .001***    0.049
missed_detection     angelman syndrome        0.91      0.362    0.001
missed_detection     fragile x syndrome      -0.24      0.813    0.000
missed_det