In [1]:

import os
import glob
from pathlib import Path

import numpy as np
from tqdm import tqdm



# Root directory of the extracted features; the loaders below look for
# '<experiment>-features/<feature_name>/*.npy' underneath it.
base_folder_path = Path('/data/storage025/wavs_single_channel_normalized_nosil/')

# np.set_printoptions(threshold=np.inf)




def get_group_id(filename):
    """Extract the two-character group code from an utterance file name.

    The code is read from the part of the base name that precedes the first
    underscore, taking the characters at positions ``[-4:-2]`` of that part
    (e.g. ``'subj-2112_41_L_...'`` yields ``'21'``).

    Args:
        filename: File name or path; only its base name is inspected.

    Returns:
        One of ``'11'``, ``'21'`` or ``'22'``.

    Raises:
        ValueError: If the extracted code is not one of the three known codes.
    """
    subject_token = os.path.basename(filename).split('_')[0]
    group_id = subject_token[-4:-2]
    if group_id in ('11', '21', '22'):
        return group_id
    raise ValueError(f"Invalid group id {group_id}")

def add2list(group_id, feature, ls):
    """Append ``feature`` to the sublist of ``ls`` that belongs to ``group_id``.

    Group codes map to sublist positions as ``'11' -> 0``, ``'21' -> 1`` and
    ``'22' -> 2``. Any other code is reported on stdout and ``ls`` is left
    untouched.

    Args:
        group_id: Group code as returned by ``get_group_id``.
        feature: Item to store.
        ls: List of three lists, mutated in place.
    """
    slot_order = ('11', '21', '22')
    if group_id not in slot_order:
        print(f'Invalid group id {group_id}')
        return
    ls[slot_order.index(group_id)].append(feature)
        
       
       
def load_f0():
    """Collect ``log(mean(f0))`` for every utterance, per group and experiment.

    For each experiment, every ``*.npy`` file under
    ``<base_folder_path>/<experiment>-features/f0`` is loaded together with
    its counterpart whose path has ``_normalized_`` replaced by ``_``. The
    mean absolute difference between the two arrays is collected for every
    file and summarised on stdout at the end.

    A file contributes no feature when:

    * its array is empty,
    * every value is 0,
    * its maximum is below 60 (reported as "60Hz" in the log output), or
    * nothing is left after dropping zeros and values >= 500.0.

    Returns:
        tuple: ``(all3, exp2lists)``. ``exp2lists`` maps each experiment name
        to three lists (one per group code '11', '21', '22', in that order) of
        ``(file_stem, log_mean_f0)`` tuples. ``all3`` holds the same three
        per-group lists concatenated over the experiments in the order
        BoundaryTone, EarlyLate, PictureNaming.

    Raises:
        ValueError: Propagated from ``get_group_id`` for an unknown group code.
    """
    feature_name = 'f0'
    experiments = ('BoundaryTone', 'EarlyLate', 'PictureNaming')

    # 3 sublists per experiment, filled by add2list according to the group code
    exp2lists = {exp: [[], [], []] for exp in experiments}
    avg_diff = []

    for folder in experiments:
        feature_folder = Path(base_folder_path) / (folder + '-features') / feature_name
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        all_zero_count = 0
        low_max_count = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)
            if feature.size == 0:
                # np.max / np.min raise on empty input, so skip explicitly.
                print(f'Empty feature array in {npy_file}')
                continue

            # Compare against the same file in the non-normalized tree.
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff.append(np.average(np.abs(feature - feature_nonorm)))

            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                all_zero_count += 1
                continue

            if np.max(feature) < 60:
                print(f'max value < 60 in {npy_file}')
                low_max_count += 1
                continue

            group_id = get_group_id(npy_file.stem)
            feature = feature[feature != 0]
            feature = feature[feature < 500.0]
            if feature.shape[0] == 0:
                print(f'All value larger than 500.0 in {npy_file}')
                continue

            add2list(group_id, (npy_file.stem, np.log(np.mean(feature))), exp2lists[folder])
        print(f'{all_zero_count} files with all 0 values')
        print(f'{low_max_count} files with max value < 60Hz')

    avg_diff = np.array(avg_diff)
    print(len(avg_diff))
    if avg_diff.size:
        # The first line reports the minimum; it was previously mislabeled "mean".
        print('min diff: ', np.min(avg_diff))
        print('avg diff: ', np.average(avg_diff))

    all3 = [
        exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]
        for i in range(3)
    ]

    return all3, exp2lists


def load_utt_feat(feature_name='energy', stats='mean'):
    """Collect one utterance-level statistic of a feature, per group and experiment.

    For each experiment, every ``*.npy`` file under
    ``<base_folder_path>/<experiment>-features/<feature_name>`` is loaded.
    Empty and all-zero arrays are skipped. Zeros are dropped from the
    remaining arrays; for ``feature_name == 'f0'`` values >= 500.0 are dropped
    as well and the natural log is taken before the statistic is computed.

    Args:
        feature_name: Name of the feature sub-folder to read (e.g. 'energy',
            'f0').
        stats: ``'mean'`` or ``'std'`` selects the reduction applied to each
            array. Any other value leaves the array unreduced and stores it
            as is.

    Returns:
        tuple: ``(all3, exp2lists)``. ``exp2lists`` maps each experiment name
        to three lists (one per group code '11', '21', '22', in that order) of
        ``(file_stem, value)`` tuples. ``all3`` holds the same three per-group
        lists concatenated over the experiments in the order BoundaryTone,
        EarlyLate, PictureNaming.

    Raises:
        ValueError: Propagated from ``get_group_id`` for an unknown group code.
    """
    experiments = ('BoundaryTone', 'EarlyLate', 'PictureNaming')

    # 3 sublists per experiment, filled by add2list according to the group code
    exp2lists = {exp: [[], [], []] for exp in experiments}

    for folder in experiments:
        feature_folder = Path(base_folder_path) / (folder + '-features') / feature_name
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        all_zero_count = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)
            if feature.size == 0:
                # np.max / np.min raise on empty input, so skip explicitly.
                print(f'Empty feature array in {npy_file}')
                continue

            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                all_zero_count += 1
                continue

            group_id = get_group_id(npy_file.stem)
            feature = feature[feature != 0]
            if feature_name == 'f0':
                feature = feature[feature < 500.0]
                if feature.shape[0] == 0:
                    print(f'All value larger than 500.0 in {npy_file}')
                    continue
                feature = np.log(feature)

            if stats == 'mean':
                feature = np.mean(feature)
            elif stats == 'std':
                feature = np.std(feature)
            add2list(group_id, (npy_file.stem, feature), exp2lists[folder])
        print(f'{all_zero_count} files with all 0 values')

    all3 = [
        exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]
        for i in range(3)
    ]

    return all3, exp2lists
    

def load_rp(filelist_dir='/home/yzhong/gits/TurnTakingPD/filelists'):
    """Load one numeric value per utterance from the per-experiment file lists.

    For each experiment the tab-separated file
    ``clean_id_responsetime_<experiment>_filtered.txt`` in ``filelist_dir`` is
    read. On every non-blank line the first field is taken as the utterance
    name and the last field is parsed as a float (presumably the response
    time, going by the file name; confirm against the file producer).

    Args:
        filelist_dir: Directory that contains the three file lists. Defaults
            to the location the notebook originally hard-coded.

    Returns:
        tuple: ``(all3, exp2lists)``. ``exp2lists`` maps each experiment name
        to three lists (one per group code '11', '21', '22', in that order) of
        ``(utterance_name, value)`` tuples. ``all3`` holds the same three
        per-group lists concatenated over the experiments in the order
        BoundaryTone, EarlyLate, PictureNaming.

    Raises:
        ValueError: If the last field of a line is not a float, or propagated
            from ``get_group_id`` for an unknown group code.
        OSError: If a file list cannot be opened.
    """
    experiments = ('BoundaryTone', 'EarlyLate', 'PictureNaming')

    # 3 sublists per experiment, filled by add2list according to the group code
    exp2lists = {exp: [[], [], []] for exp in experiments}

    for folder in experiments:
        feature_list = os.path.join(filelist_dir, 'clean_id_responsetime_' + folder + '_filtered.txt')

        cnt = 0
        with open(feature_list, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would otherwise crash float('').
                    continue
                fields = line.split('\t')
                rp = float(fields[-1])
                basename = fields[0]
                group_id = get_group_id(basename)
                add2list(group_id, (basename, rp), exp2lists[folder])
                cnt += 1

        print(f'Processing {feature_list} ...')
        print(f'Found {cnt}')

    all3 = [
        exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]
        for i in range(3)
    ]

    return all3, exp2lists

# Per-utterance mean and standard deviation of log-F0.
all3_f0, exp2list_f0 = load_utt_feat(feature_name='f0', stats='mean')
all3_f0_var, exp2list_f0_var = load_utt_feat(feature_name='f0', stats='std')

# Per-utterance mean and standard deviation of energy.
all3_energy, exp2list_energy = load_utt_feat(feature_name='energy', stats='mean')
all3_energy_var, exp2list_energy_var = load_utt_feat(feature_name='energy', stats='std')

# Per-utterance values read from the filtered response-time file lists.
all3_rp, exp2list_rp = load_rp()
        


Processing BoundaryTone folder...
Found 5026 npy files
0 files with all 0 values
Processing EarlyLate folder...
Found 3971 npy files
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/EarlyLate-features/f0/subj-2112_41_L_gereedschap_tanden.wav_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/EarlyLate-features/f0/subj-2120_17_E_piept_knaagdier.wav_1.wav_f0.npy
0 files with all 0 values
Processing PictureNaming folder...
Found 1652 npy files
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2112_rups.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2202_schatkist.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2127_rups.png_1.wav_f0.npy
All value larger than 500.0 in /data/s

In [2]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Group labels, index-aligned with the three per-group sublists built by the
# loaders (add2list stores codes '11', '21', '22' at positions 0, 1, 2).
groups = ['YA', 'OA', 'PD']

def lme(exp2list):
    """Fit a mixed linear model on utterance-level values and print its summary.

    Builds one row per utterance with the columns ``experiment``, ``group``,
    ``value`` and ``subject_id``, then fits
    ``value ~ group + experiment + group:experiment`` with ``subject_id`` as
    the grouping variable.

    The first per-group sublist (index 0, labelled ``groups[0]``) is left out,
    so only the remaining groups enter the model.

    Args:
        exp2list: Mapping from experiment name to three per-group lists of
            ``(utterance_name, value)`` entries, as returned by the loaders.
            The subject id is parsed from the utterance name: the text between
            the first '-' and the following '_', converted to int.
    """
    experiment_order = ['PictureNaming', 'EarlyLate', 'BoundaryTone']

    rows = []
    for position, exp_name in enumerate(experiment_order, start=1):
        experiment_label = f'exp_{position}_{exp_name}'
        indexed_groups = list(enumerate(exp2list[exp_name]))
        # Skip the sublist at index 0; keep the original indices for labelling.
        for group_idx, entries in indexed_groups[1:]:
            for entry in entries:
                subject_token = entry[0].split('-')[1].split('_')[0]
                rows.append({
                    'experiment': experiment_label,
                    'group': groups[group_idx],
                    'value': entry[1],
                    'subject_id': int(subject_token),
                })

    df = pd.DataFrame(rows)

    result = smf.mixedlm(
        "value ~ group + experiment + group:experiment",
        data=df,
        groups="subject_id",
    ).fit()

    print(result.summary())

# Fit the model on one feature at a time; swap the active call to analyse
# one of the other features loaded in the previous cell.
lme(exp2list_f0)
# lme(exp2list_f0_var)
# lme(exp2list_energy)
# lme(exp2list_energy_var)
# lme(exp2list_rp)

                         Mixed Linear Model Regression Results
Model:                       MixedLM            Dependent Variable:            value    
No. Observations:            8177               Method:                        REML     
No. Groups:                  55                 Scale:                         0.0116   
Min. group size:             55                 Log-Likelihood:                6413.4252
Max. group size:             156                Converged:                     Yes      
Mean group size:             148.7                                                      
----------------------------------------------------------------------------------------
                                             Coef.  Std.Err.    z    P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------
Intercept                                     4.996    0.038 130.351 0.000  4.921  5.071
group[T.PD]                                   0