In [7]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from pymer4.models import Lmer

from load_feat_pd import *


def lme(metadata, outfile, featname='shimmer', level='utt', stats='mean'):
    """Fit a linear mixed-effects model for one feature and log the result.

    The records in ``metadata`` are loaded into a DataFrame, cleaned, reduced
    to one scalar ``value`` per row and passed to pymer4's ``Lmer`` with the
    formula ``value ~ age + group_id*experiment + group_id*gender +
    (1|subject_id) + (1|item)``. The fixed-effects table and the random-effect
    variances are appended to ``outfile``; the fixed-effects table is also
    printed.

    Args:
        metadata: Anything ``pd.DataFrame`` accepts. The resulting frame must
            provide the columns ``group_id``, ``age``, ``filename``,
            ``experiment``, ``value``, ``subject_id`` and ``gender``.
        outfile: Path of the text report; it is opened in append mode.
        featname: Feature label used in the report header. ``'rp'`` is
            written out as ``'response_time'``.
        level: ``'utt'`` keeps the first element of each ``value`` entry;
            ``'frame'`` collapses each ``value`` entry with ``stats``.
        stats: Reduction used when ``level == 'frame'``: ``'mean'``, ``'std'``
            or ``'median'``. Ignored for ``level == 'utt'``.

    Raises:
        ValueError: If ``level`` is not ``'utt'``/``'frame'``, or if
            ``level == 'frame'`` and ``stats`` is not a supported reduction.
    """
    frame_reducers = {
        'mean': lambda x: x.mean(),
        'std': lambda x: x.std(),
        'median': lambda x: np.median(x),
    }

    # Fail fast: an unsupported option used to fall through silently, leaving
    # `value` as a column of arrays that only blew up inside the model fit.
    if level not in ('utt', 'frame'):
        raise ValueError(f"unsupported level {level!r}; expected 'utt' or 'frame'")
    if level == 'frame' and stats not in frame_reducers:
        raise ValueError(
            f"unsupported stats {stats!r}; expected one of {sorted(frame_reducers)}"
        )

    df = pd.DataFrame(metadata)

    # Drop group '11'. The explicit copy makes the column assignments below
    # operate on an independent frame instead of a filtered slice.
    df = df[df['group_id'] != '11'].copy()

    # Relabel the remaining group codes: '21' -> HC, '22' -> PD.
    df['group_id'] = df['group_id'].map({'21': 'HC', '22': 'PD'})

    # Age arrives as a string; anything unparseable becomes NaN and is dropped.
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    missing_age = df['age'].isnull().sum()
    if missing_age > 0:
        print(f"warning 'age' has {missing_age} NaN, has been deleted")
        df = df.dropna(subset=['age'])

    # The item name is the last '_'-separated token of the filename stem
    # (the part before the first '.').
    df['item'] = df['filename'].apply(lambda x: x.split('.')[0].split('_')[-1])

    # Fix the factor level order for `experiment`; exp_2_EarlyLate comes first.
    # group_id and gender are left as plain string columns.
    df['experiment'] = pd.Categorical(
        df['experiment'],
        categories=['exp_2_EarlyLate', 'exp_3_BoundaryTone', 'exp_1_PictureNaming'],
    )
    print("Experiment Levels:", df['experiment'].cat.categories)

    print(df.head())

    # Collapse every `value` entry to a single scalar.
    if level == 'utt':
        df['value'] = df['value'].apply(lambda x: x[0])
    else:
        df['value'] = df['value'].apply(frame_reducers[stats])

    featname = "response_time" if featname == 'rp' else featname
    valuetype = stats if level == 'frame' else ''

    # Built once so the console and the report file carry the same header.
    header = f'=============== value: {featname}_{valuetype} ====================\n'
    print(header)

    model = Lmer(
        "value ~ age + group_id*experiment + group_id*gender + (1|subject_id) + (1|item)",
        data=df,
    )
    result = model.fit()

    fixed_effects = result[['Estimate', 'SE', 'T-stat', 'P-val', 'Sig']]

    # Append, so successive calls accumulate in one report file.
    with open(outfile, 'a') as f:
        f.write('\n')
        f.write(header)
        f.write(fixed_effects.to_string())
        f.write('\n')
        f.write(model.ranef_var.to_string())
        f.write('\n')

    print(fixed_effects)


        
if __name__ == '__main__':
    # Feature roots: `energy` is read from the un-normalized recordings,
    # every other feature from the normalized ones.
    base_folder_path_unnorm = Path('/data/storage025/Turntaking/wavs_single_channel_nosil')
    base_folder_path = Path('/data/storage025/Turntaking/wavs_single_channel_normalized_nosil')

    # Granularity at which each feature is stored.
    feats2level = {
        'jitter': 'utt',
        'shimmer': 'utt',
        'rp': 'utt',
        'f0': 'frame',
        'energy': 'frame'
    }
    allfeats = ['jitter', 'shimmer', 'rp', 'f0', 'energy']

    np.set_printoptions(precision=2)

    # Start from a clean report: lme() appends, so remove any earlier file.
    outfile = './res_v1_Earlylate.txt'
    if os.path.exists(outfile):
        os.remove(outfile)

    for featname in allfeats:
        level = feats2level[featname]
        folder = base_folder_path if featname != 'energy' else base_folder_path_unnorm

        # Load once per feature; both branches below reuse the same records.
        metadata = load_feat(folder, feature_name=featname)

        if level != 'frame':
            lme(metadata, outfile, featname=featname, level=level)
            continue

        # Frame-level features get one model per summary statistic.
        for stats in ('mean', 'std'):
            lme(metadata, outfile, featname=featname, level=level, stats=stats)




Processing PictureNaming folder...
Found 1652 npy files
0 files with all 0 values
Processing EarlyLate folder...
Found 3971 npy files
nan in /data/storage025/Turntaking/wavs_single_channel_normalized_nosil/EarlyLate-features/jitter/subj-2120_27_E_kwaad_geloof.wav_1.wav_jitter.npy
0 files with all 0 values
Processing BoundaryTone folder...
Found 5026 npy files
0 files with all 0 values
Experiment Levels: Index(['exp_2_EarlyLate', 'exp_3_BoundaryTone', 'exp_1_PictureNaming'], dtype='object')
            experiment group_id                   value subject_id  \
0  exp_1_PictureNaming       HC  [0.021579830749940257]       2126   
1  exp_1_PictureNaming       HC  [0.030827921605807555]       2104   
2  exp_1_PictureNaming       HC  [0.015053068922943754]       2129   
3  exp_1_PictureNaming       PD  [0.026192795152750557]       2214   
4  exp_1_PictureNaming       HC   [0.02136526112037015]       2118   

                 filename          item   age gender moca education  
0  subj-2126_r

  ran_vars = ran_vars.applymap(


Linear mixed model fit by REML [’lmerMod’]
Formula: value~age+group_id*experiment+group_id*gender+(1|subject_id)+(1|item)

Family: gaussian	 Inference: parametric

Number of observations: 7762	 Groups: {'item': 140.0, 'subject_id': 51.0}

Log-likelihood: 26460.619 	 AIC: -52897.237

Random effects:

                   Name  Var    Std
item        (Intercept)  0.0  0.004
subject_id  (Intercept)  0.0  0.004
Residual                 0.0  0.008

No random effect correlations specified

Fixed effects:

                                          Estimate     SE  T-stat  P-val Sig
(Intercept)                                  0.023  0.008   3.030  0.004  **
age                                          0.000  0.000   0.936  0.354    
group_idPD                                  -0.002  0.002  -1.153  0.255    
experimentexp_3_BoundaryTone                 0.000  0.003   0.055  0.956    
experimentexp_1_PictureNaming               -0.002  0.001  -2.972  0.003  **
genderV                            

  ran_vars = ran_vars.applymap(


Linear mixed model fit by REML [’lmerMod’]
Formula: value~age+group_id*experiment+group_id*gender+(1|subject_id)+(1|item)

Family: gaussian	 Inference: parametric

Number of observations: 7762	 Groups: {'item': 140.0, 'subject_id': 51.0}

Log-likelihood: 17066.807 	 AIC: -34109.613

Random effects:

                   Name    Var    Std
item        (Intercept)  0.000  0.014
subject_id  (Intercept)  0.000  0.012
Residual                 0.001  0.026

No random effect correlations specified

Fixed effects:

                                          Estimate     SE  T-stat  P-val  Sig
(Intercept)                                  0.102  0.022   4.680  0.000  ***
age                                          0.000  0.000   1.002  0.321     
group_idPD                                  -0.000  0.005  -0.072  0.943     
experimentexp_3_BoundaryTone                -0.006  0.010  -0.578  0.564     
experimentexp_1_PictureNaming               -0.005  0.003  -1.696  0.092    .
genderV              

  ran_vars = ran_vars.applymap(


Linear mixed model fit by REML [’lmerMod’]
Formula: value~age+group_id*experiment+group_id*gender+(1|subject_id)+(1|item)

Family: gaussian	 Inference: parametric

Number of observations: 7763	 Groups: {'item': 140.0, 'subject_id': 51.0}

Log-likelihood: -9565.135 	 AIC: 19154.269

Random effects:

                   Name    Var    Std
item        (Intercept)  0.222  0.472
subject_id  (Intercept)  0.098  0.313
Residual                 0.643  0.802

No random effect correlations specified

Fixed effects:

                                          Estimate     SE  T-stat  P-val  Sig
(Intercept)                                 -0.917  0.584  -1.569  0.123     
age                                          0.029  0.008   3.613  0.001  ***
group_idPD                                   0.259  0.133   1.946  0.057    .
experimentexp_3_BoundaryTone                -0.055  0.338  -0.162  0.872     
experimentexp_1_PictureNaming               -0.013  0.101  -0.131  0.896     
genderV               

  ran_vars = ran_vars.applymap(


Linear mixed model fit by REML [’lmerMod’]
Formula: value~age+group_id*experiment+group_id*gender+(1|subject_id)+(1|item)

Family: gaussian	 Inference: parametric

Number of observations: 7757	 Groups: {'item': 140.0, 'subject_id': 51.0}

Log-likelihood: -32553.385 	 AIC: 65130.770

Random effects:

                   Name      Var     Std
item        (Intercept)   26.559   5.154
subject_id  (Intercept)  296.152  17.209
Residual                 244.491  15.636

No random effect correlations specified

Fixed effects:

                                          Estimate      SE  T-stat  P-val  Sig
(Intercept)                                105.195  31.405   3.350  0.002   **
age                                          0.273   0.435   0.628  0.533     
group_idPD                                  -2.514   7.064  -0.356  0.724     
experimentexp_3_BoundaryTone                -0.530   3.715  -0.143  0.887     
experimentexp_1_PictureNaming                2.755   1.223   2.254  0.026    *
gen

  ran_vars = ran_vars.applymap(


Linear mixed model fit by REML [’lmerMod’]
Formula: value~age+group_id*experiment+group_id*gender+(1|subject_id)+(1|item)

Family: gaussian	 Inference: parametric

Number of observations: 7757	 Groups: {'item': 140.0, 'subject_id': 51.0}

Log-likelihood: -29262.215 	 AIC: 58548.430

Random effects:

                   Name      Var     Std
item        (Intercept)   11.229   3.351
subject_id  (Intercept)   20.394   4.516
Residual                 105.706  10.281

No random effect correlations specified

Fixed effects:

                                          Estimate     SE  T-stat  P-val  Sig
(Intercept)                                 11.636  8.368   1.391  0.171     
age                                          0.089  0.116   0.767  0.447     
group_idPD                                  -3.717  1.903  -1.953  0.057    .
experimentexp_3_BoundaryTone                 4.651  2.416   1.925  0.057    .
experimentexp_1_PictureNaming               -0.369  0.797  -0.462  0.644     
genderV  

  ran_vars = ran_vars.applymap(


Linear mixed model fit by REML [’lmerMod’]
Formula: value~age+group_id*experiment+group_id*gender+(1|subject_id)+(1|item)

Family: gaussian	 Inference: parametric

Number of observations: 7763	 Groups: {'item': 140.0, 'subject_id': 51.0}

Log-likelihood: -6681.807 	 AIC: 13387.615

Random effects:

                   Name    Var    Std
item        (Intercept)  0.019  0.137
subject_id  (Intercept)  0.350  0.592
Residual                 0.310  0.556

No random effect correlations specified

Fixed effects:

                                          Estimate     SE  T-stat  P-val  Sig
(Intercept)                                  0.517  1.080   0.479  0.634     
age                                         -0.002  0.015  -0.120  0.905     
group_idPD                                   0.174  0.243   0.714  0.479     
experimentexp_3_BoundaryTone                -0.044  0.100  -0.441  0.660     
experimentexp_1_PictureNaming                0.246  0.036   6.863  0.000  ***
genderV               

  ran_vars = ran_vars.applymap(
