In [20]:

import os
import glob
from pathlib import Path

import numpy as np
from tqdm import tqdm



base_folder_path = Path('/data/storage025/wavs_single_channel_normalized_nosil/')

demo_data_file = '/home/yzhong/gits/TurnTakingPD/demogr_perpp.txt'
ID2EMO = {}
with open(demo_data_file, 'r') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        datas = line.split('\t')
        ID2EMO[datas[0]] = datas[1:]
        

# np.set_printoptions(threshold=np.inf)


def get_demo(filename):
    subject_id = filename.split('_')[0][-4:]
    group_id = subject_id[:2]
    if group_id not in ['11', '21', '22']:
        raise ValueError(f"Invalid group id {group_id}")
    if subject_id in {'2219', '2123'}:
        return None, None, None
    if subject_id in {'2135'}:
        return subject_id, group_id, ['NA', 'NA', 'NA', 'NA']
    return subject_id, group_id, ID2EMO[subject_id]


def add2list(group_id, feature, ls):
    if group_id == '11':
        ls[0].append(feature)
    elif group_id == '21':
        ls[1].append(feature)
    elif group_id == '22':
        ls[2].append(feature)
    else:
        print(f'Invalid group id {group_id}')
        
       
   

def load_utt_feat(feature_name='energy', stats='mean'):
        # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    avg_diff = []
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
        feature_folder = os.path.join(base_folder_path, folder + '-features', feature_name)
        feature_folder = Path(feature_folder)
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        cnt = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)         
            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                continue
            
            group_id = get_group_id(npy_file.stem)
            feature = feature[feature != 0]
            if feature_name == 'f0':
                feature = feature[feature < 500.0]
                feature = np.log(feature)
                if feature.shape[0] == 0:
                    print(f'All value larger than 500.0 in {npy_file}')
                    continue
            if stats == 'mean':
                feature = np.mean(feature)
            elif stats == 'std':
                feature = np.std(feature)
            add2list(group_id, (npy_file.stem, feature), exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        
    
    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]


        
    return all3, exp2lists
    

def load_rp():
        # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
        feature_list = os.path.join('/home/yzhong/gits/TurnTakingPD/filelists', 'clean_id_responsetime_' + folder + '_filtered.txt')
        
        cnt = 0
        with open(feature_list, 'r') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                rp = float(line.strip().split('\t')[-1])
                basename = line.strip().split('\t')[0]
                group_id = get_group_id(basename)
                add2list(group_id, (basename, rp), exp2lists[folder])
                cnt += 1
                
        print(f'Processing {feature_list} ...')
        print(f'Found {cnt}')


    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]
    
    return all3, exp2lists

# all3_f0, exp2list_f0 = load_utt_feat(feature_name='f0')
# all3_f0_var, exp2list_f0_var = load_utt_feat(feature_name='f0', stats='std')

# all3_energy, exp2list_energy = load_utt_feat(feature_name='energy')
# all3_energy_var, exp2list_energy_var = load_utt_feat(feature_name='energy', stats='std')

# all3_rp, exp2list_rp = load_rp()
        


In [36]:

import os
import glob
from pathlib import Path

import numpy as np
from tqdm import tqdm
import pandas as pd


def load_frame_feat(feature_name='energy', stats='mean', log_value=False):
        # 3 sublists for YA OA PD
    all_data = []

    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    avg_diff = []
    for exp_idx, folder in enumerate(['PictureNaming', 'EarlyLate', 'BoundaryTone']):
        feature_folder = os.path.join(base_folder_path, folder + '-features', feature_name)
        feature_folder = Path(feature_folder)
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        cnt = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)         
            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                continue
            
            subject_id, group_id, demo_data = get_demo(npy_file.stem)
            if subject_id is None:
                continue
            feature = feature[feature != 0]
            if feature_name == 'f0':
                feature = feature[feature < 500.0]
                if feature.shape[0] == 0:
                    print(f'All value larger than 500.0 in {npy_file}')
                    continue

            if log_value is True:              
                feature = np.log(feature)


            item = {
                    'experiment': 'exp_' + str(exp_idx + 1) + '_' + folder,
                    'group_id': group_id,
                    'value': feature,
                    'subject_id':subject_id,
                    'filename': npy_file.stem,
                    'age': demo_data[0],
                    'gender': demo_data[1],
                    'moca': demo_data[2],
                    'education': demo_data[3],
                }
            all_data.append(item)

            # if stats == 'mean':
            #     feature = np.mean(feature)
            # elif stats == 'std':
            #     feature = np.std(feature)
            
            # add2list(group_id, (npy_file.stem, feature), exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        
    
    # all3 = [[], [], []] 
    # for i in range(3):
    #     all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]

        
    return all_data
    

metadata = load_frame_feat(feature_name='f0')
print(metadata[0])

Processing PictureNaming folder...
Found 1652 npy files
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2112_rups.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2202_schatkist.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2127_rups.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2120_kaars.png_1.wav_f0.npy
0 files with all 0 values
Processing EarlyLate folder...
Found 3971 npy files
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/EarlyLate-features/f0/subj-2112_41_L_gereedschap_tanden.wav_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/EarlyLate-features/f0/subj-2120_17_E_piept_kn

In [89]:
from scipy.stats import shapiro, kruskal, kstest, norm, levene
import pandas as pd


def basic_stats(group1, group2, groupnames=['OA(HC)', 'PD']):

    for i, group in enumerate([group1, group2]):
        # print(f'\n  stats of {groupnames[0]}')
        # # merge all data in subgroup
        # print(group.shape)
        # check if nan in subgroup
        if np.isnan(group).any():
            print('nan in subgroup')

        yield np.mean(group), np.std(group), np.median(group)
        


def stats_test(group1, group2, groupnames=['OA(HC)', 'PD']):


    d_stat, p_ks1 = kstest(group1, 'norm')
    # print(f"{groupnames[0]} kstest p-value: {p_ks1}")
 
    d_stat, p_ks2 = kstest(group2, 'norm')
    # print(f"{groupnames[1]} kstest p-value: {p_ks2}")
 
    stat, p_kw = kruskal(group1, group2)
    # print(f"Kruskal-Wallis test p value between group {groupnames[0]} and {groupnames[1]} : {p_kw}")

  
    levene_stat, p_lev = levene(group1, group2, center='median')
    # print(f"Levene test p value between group {groupnames[0]} and {groupnames[1]} : {p_lev}")

    return p_ks1, p_ks2, p_kw, p_lev

np.set_printoptions(precision=2)

df = pd.DataFrame(metadata)

# change value to mean of the array
# delete all item in dataframe with group id equal to 11
df = df[df['group_id'] != '11']


def all_level_analysis(df):

    item = {}
    heads = ['level', 'OA_values', 'PD_values', 'OA_mean', 'PD_mean', 'OA_std', 'PD_std', 'OA_median', 'PD_median', 'p_ks1', 'p_ks2', 'p_kw', 'p_lev']
    result_datas = []
    
    # frame level analysis
    OA_values = np.concatenate(df[df['group_id'] == '21']['value'].values)
    PD_values = np.concatenate(df[df['group_id'] == '22']['value'].values)
    print(len(OA_values), len(PD_values))
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['frame', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())

    # utterance level
    # mean
    df_utt_mean = df.copy()
    df_utt_mean['value'] = df['value'].apply(lambda x: np.mean(x))
    OA_values = np.array(df_utt_mean[df_utt_mean['group_id'] == '21']['value'].values)
    PD_values = np.array(df_utt_mean[df_utt_mean['group_id'] == '22']['value'].values)
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['utterance_mean', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())
    

    # std
    df_utt_std = df.copy()
    df_utt_std['value'] = df['value'].apply(lambda x: np.std(x))
    OA_values = np.array(df_utt_std[df_utt_std['group_id'] == '21']['value'].values)
    PD_values = np.array(df_utt_std[df_utt_std['group_id'] == '22']['value'].values)
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['utterance_std', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())

    # person level
    # mean of mean of all utterances
    df_person_mean_mean = df.copy()
    df_person_mean_mean['value'] = df['value'].apply(lambda x: np.mean(x))
    OA_values = np.array(df_person_mean_mean[df_person_mean_mean['group_id'] == '21'].groupby('subject_id')['value'].mean().values)
    PD_values = np.array(df_person_mean_mean[df_person_mean_mean['group_id'] == '22'].groupby('subject_id')['value'].mean().values)
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['person_mean_mean', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())
    
    # std of mean of all utterances
    df_person_std_mean= df.copy()
    df_person_std_mean['value'] = df['value'].apply(lambda x: np.mean(x))
    OA_values = np.array(df_person_std_mean[df_person_std_mean['group_id'] == '21'].groupby('subject_id')['value'].std().values)
    PD_values = np.array(df_person_std_mean[df_person_std_mean['group_id'] == '22'].groupby('subject_id')['value'].std().values)
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['person_std_mean', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())

    # mean of std of all utterances
    df_person_mean_std = df.copy()
    df_person_mean_std['value'] = df['value'].apply(lambda x: np.std(x))
    OA_values = np.array(df_person_mean_std[df_person_mean_std['group_id'] == '21'].groupby('subject_id')['value'].mean().values)
    PD_values = np.array(df_person_mean_std[df_person_mean_std['group_id'] == '22'].groupby('subject_id')['value'].mean().values)
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['person_mean_std', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())
    
    # std of std of all utterances
    df_person_std_std = df.copy()
    df_person_std_std['value'] = df['value'].apply(lambda x: np.std(x))
    OA_values = np.array(df_person_std_std[df_person_std_std['group_id'] == '21'].groupby('subject_id')['value'].std().values)
    PD_values = np.array(df_person_std_std[df_person_std_std['group_id'] == '22'].groupby('subject_id')['value'].std().values)
    OA_stats, PD_stats = basic_stats(OA_values, PD_values, groupnames=['OA', 'PD'])
    p_ks1, p_ks2, p_kw, p_lev = stats_test(OA_values, PD_values, groupnames=['OA', 'PD'])
    reses = ['person_std_std', len(OA_values), len(PD_values), OA_stats[0], PD_stats[0], OA_stats[1], PD_stats[1], OA_stats[2], PD_stats[2], p_ks1, p_ks2, p_kw, p_lev]
    # assign the values to the item
    for k, v in zip(heads, reses):
        item[k] = v
        
    result_datas.append(item.copy())
    
    return result_datas
# generate excel file to write the results

res_df_allexp = pd.DataFrame(all_level_analysis(df))
# print(res_df_allexp)

res_df_PictureNaming = pd.DataFrame(all_level_analysis(df[df['experiment'] == 'exp_1_PictureNaming']))
res_df_EarlyLate = pd.DataFrame(all_level_analysis(df[df['experiment'] == 'exp_2_EarlyLate']))
res_df_BoundaryTone = pd.DataFrame(all_level_analysis(df[df['experiment'] == 'exp_3_BoundaryTone']))

# print(res_df_PictureNaming)
# print(res_df_EarlyLate)
# print(res_df_BoundaryTone)

# write all these res to one excel file and one sheet for each experiment
with pd.ExcelWriter('frame_level_analysis.xlsx') as writer:
    res_df_allexp.to_excel(writer, sheet_name='all_exp')
    res_df_PictureNaming.to_excel(writer, sheet_name='PictureNaming')
    res_df_EarlyLate.to_excel(writer, sheet_name='EarlyLate')
    res_df_BoundaryTone.to_excel(writer, sheet_name='BoundaryTone')


341788 230872
              level  OA_values  PD_values     OA_mean     PD_mean     OA_std  \
0             frame     341788     230872  155.761030  157.979382  47.209862   
1    utterance_mean       4928       3085  151.909254  153.508686  35.380188   
2     utterance_std       4928       3085   24.368925   19.790064  12.843787   
3  person_mean_mean         33         20  152.763959  153.270252  30.845278   
4   person_std_mean         33         20   17.134302   12.733547   4.572721   
5   person_mean_std         33         20   24.434859   19.767887   5.776995   
6    person_std_std         33         20   11.193803    9.698034   2.317227   

      PD_std   OA_median   PD_median  p_ks1          p_ks2           p_kw  \
0  45.299326  150.264428  157.371053    0.0   0.000000e+00  1.078665e-156   
1  36.802806  148.071497  153.944591    0.0   0.000000e+00   4.795595e-02   
2  11.448280   22.486293   16.867276    0.0   0.000000e+00   3.923598e-76   
3  34.385554  145.395567  153.717728 

ModuleNotFoundError: No module named 'openpyxl'

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

groups = ['YA', 'OA', 'PD']

def lme(exp2list):
    all_data = []

    for exp_idx, exp in enumerate(['PictureNaming', 'EarlyLate', 'BoundaryTone']):
        experiment_data = exp2list[exp]
        for group_idx, group_array in enumerate(experiment_data):
            if group_idx == 0:
                continue
            for value in group_array:
                utt = {
                    'experiment': 'exp_' + str(exp_idx + 1) + '_' + exp,
                    'group': groups[group_idx],
                    'value': value[1],
                    'subject_id': int(value[0].split('-')[1].split('_')[0])
                }
                all_data.append(utt)

    df = pd.DataFrame(all_data)

    model = smf.mixedlm("value ~ group + experiment + group:experiment", 
                        data=df, 
                        groups="subject_id")
                        

    result = model.fit()

    print(result.summary())

lme(exp2list_f0)
# lme(exp2list_f0_var)
# lme(exp2list_energy)
# lme(exp2list_energy_var)
# lme(exp2list_rp)