In [23]:

import os
import glob
from pathlib import Path

import numpy as np
from tqdm import tqdm



base_folder_path = Path('/data/storage025/wavs_single_channel_normalized_nosil/')

# np.set_printoptions(threshold=np.inf)




def get_group_id(filename):
    filename = os.path.basename(filename)
    group_id = filename.split('_')[0][-4:-2]
    if group_id not in ['11', '21', '22']:
        raise ValueError(f"Invalid group id {group_id}")
    return group_id

def add2list(group_id, feature, ls):
    if group_id == '11':
        ls[0].append(feature)
    elif group_id == '21':
        ls[1].append(feature)
    elif group_id == '22':
        ls[2].append(feature)
    else:
        print(f'Invalid group id {group_id}')
        
       
       
def load_f0():
    feature_name = 'f0'
    
    # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    avg_diff = []
    # f0_diff_list = open('f0_diff_list.txt', 'w')
    # non_f0_list = open('non_f0_list.txt', 'w')
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
    # for folder in ['BoundaryTone']:
        feature_folder = os.path.join(base_folder_path, folder + '-features', feature_name)
        feature_folder = Path(feature_folder)
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')
        

        cnt = 0
        cnt2 = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff_each = np.average(np.abs(feature - feature_nonorm))
            # if avg_diff_each > 1:
            #     f0_diff_list.write(f'{os.path.basename(npy_file)}\t{avg_diff_each}\n') 
            avg_diff.append(avg_diff_each)
            
            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                # non_f0_list.write(f'{os.path.basename(npy_file)}\n')
                continue
            
            if np.max(feature) < 60:
                print(f'max value < 60 in {npy_file}')
                cnt2 += 1
                # non_f0_list.write(f'{os.path.basename(npy_file)}\n')

                continue
            
            group_id = get_group_id(npy_file.stem)
            feature = feature[feature != 0]
            feature = feature[feature < 500.0]
            if feature.shape[0] == 0:
                print(f'All value larger than 500.0 in {npy_file}')
                continue

            add2list(group_id, np.log(np.mean(feature)), exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        print(f'{cnt2} files with max value < 60Hz')
        
    
    avg_diff = np.array(avg_diff)
    # print top 10 largest diff
    print(len(avg_diff))
    # print(np.sort(avg_diff)[-800])
    print('mean diff: ', np.min(avg_diff))
    print('avg diff: ', np.average(avg_diff)) 
    # close
    # f0_diff_list.close()
    # non_f0_list.close()
    
    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]

    # deep copy BD
    all3_final = [[], [], []]

    for i in range(3):
        tmp = np.array(all3[i])
        # remove all 0 value
        tmp = tmp[tmp != 0]    
        # remove value > 500
        # tmp = tmp[tmp < 500.0]
        # print(tmp.shape)
        all3_final[i] = tmp
        
        tmp = np.array(exp2lists['BoundaryTone'][i])
        tmp = tmp[tmp != 0]
        exp2lists['BoundaryTone'][i] = tmp

        tmp = np.array(exp2lists['EarlyLate'][i])
        tmp = tmp[tmp != 0]
        exp2lists['EarlyLate'][i] = tmp

        tmp = np.array(exp2lists['PictureNaming'][i]) if len(exp2lists['PictureNaming'][i]) > 0 else np.array([0])
        tmp = tmp[tmp != 0]
        exp2lists['PictureNaming'][i] = tmp
        
        
    return all3_final, exp2lists


def load_frame_feat(feature_name='energy'):
        # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    avg_diff = []
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
        feature_folder = os.path.join(base_folder_path, folder + '-features', feature_name)
        feature_folder = Path(feature_folder)
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        cnt = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff_each = np.average(np.abs(feature - feature_nonorm))
            # if avg_diff_each > 1:
            #     f0_diff_list.write(f'{os.path.basename(npy_file)}\t{avg_diff_each}\n') 
            avg_diff.append(avg_diff_each)
            
            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                # non_f0_list.write(f'{os.path.basename(npy_file)}\n')
                continue
            
            group_id = get_group_id(npy_file.stem)
            feature = feature[feature != 0]
            add2list(group_id, np.mean(feature), exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        
    
    avg_diff = np.array(avg_diff)
    # print top 10 largest diff
    print(len(avg_diff))
    print('top10 max diff: ', np.sort(avg_diff)[-10:])
    print('mean diff: ', np.min(avg_diff))
    print('avg diff: ', np.average(avg_diff)) 
    # close
    # f0_diff_list.close()
    # non_f0_list.close()
    
    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]

    # deep copy BD
    all3_final = [[], [], []]

    for i in range(3):
        tmp = np.array(all3[i])
        # remove all 0 value
        tmp = tmp[tmp != 0]    
        # remove value > 500
        # tmp = tmp[tmp < 500.0]
        # print(tmp.shape)
        all3_final[i] = tmp
        
        tmp = np.array(exp2lists['BoundaryTone'][i])
        tmp = tmp[tmp != 0]
        exp2lists['BoundaryTone'][i] = tmp

        tmp = np.array(exp2lists['EarlyLate'][i])
        tmp = tmp[tmp != 0]
        exp2lists['EarlyLate'][i] = tmp

        tmp = np.array(exp2lists['PictureNaming'][i]) if len(exp2lists['PictureNaming'][i]) > 0 else np.array([0])
        tmp = tmp[tmp != 0]
        exp2lists['PictureNaming'][i] = tmp
        
    return all3_final, exp2lists
    

def load_rp():
        # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
        feature_list = os.path.join('/home/yzhong/gits/TurnTakingPD/filelists', 'clean_id_responsetime_' + folder + '_filtered.txt')
        
        cnt = 0
        with open(feature_list, 'r') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                rp = float(line.strip().split('\t')[-1])
                basename = line.strip().split('\t')[0]
                group_id = get_group_id(basename)
                add2list(group_id, rp, exp2lists[folder])
                cnt += 1
                
        print(f'Processing {feature_list} ...')
        print(f'Found {cnt}')


    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]
    
        # deep copy BD
    all3_final = [[], [], []]

    for i in range(3):
        tmp = np.array(all3[i])
        # remove all 0 value
        tmp = tmp[tmp != 0]    
        # remove value > 500
        # tmp = tmp[tmp < 500.0]
        # print(tmp.shape)
        all3_final[i] = tmp
        
        tmp = np.array(exp2lists['BoundaryTone'][i])
        tmp = tmp[tmp != 0]
        exp2lists['BoundaryTone'][i] = tmp

        tmp = np.array(exp2lists['EarlyLate'][i])
        tmp = tmp[tmp != 0]
        exp2lists['EarlyLate'][i] = tmp

        tmp = np.array(exp2lists['PictureNaming'][i]) if len(exp2lists['PictureNaming'][i]) > 0 else np.array([0])
        tmp = tmp[tmp != 0]
        exp2lists['PictureNaming'][i] = tmp
    
    return all3_final, exp2lists

all3_f0, exp2list_f0 = load_f0()
all3_energy, exp2list_energy = load_frame_feat(feature_name='energy')
all3_rp, exp2list_rp = load_rp()
# print(exp2list)

        


Processing BoundaryTone folder...
Found 5026 npy files
0 files with all 0 values
0 files with max value < 60Hz
Processing EarlyLate folder...
Found 3971 npy files
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/EarlyLate-features/f0/subj-2112_41_L_gereedschap_tanden.wav_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/EarlyLate-features/f0/subj-2120_17_E_piept_knaagdier.wav_1.wav_f0.npy
0 files with all 0 values
0 files with max value < 60Hz
Processing PictureNaming folder...
Found 1652 npy files
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2112_rups.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2202_schatkist.png_1.wav_f0.npy
All value larger than 500.0 in /data/storage025/wavs_single_channel_normalized_nosil/PictureNaming-features/f0/subj-2127_

In [24]:
def basic_stats(all3_final):

    group_name = ['YA', 'OA', 'PD']

    for i, subgroup in enumerate(all3_final):
        print(f'\n  stats of {group_name[i]}')
        # merge all data in subgroup
        print(subgroup.shape)
        # check if nan in subgroup
        if np.isnan(subgroup).any():
            print('nan in subgroup')

        # calculate min, max, mean, std
        # only print with 2 decimal
              
        # print('min:', f'{np.min(subgroup):.2f}')
        # print('max:', f'{np.max(subgroup):.2f}')
        # print('mean:', f'{np.mean(subgroup):.2f}')
        # print('std:', f'{np.std(subgroup):.2f}')
        # print('median:', f'{np.median(subgroup):.2f}')
        # print mean std median, only print value
        print(f'{np.mean(subgroup):.2f}')
        print(f'{np.std(subgroup):.2f}')
        print(f'{np.median(subgroup):.2f}')
        
        # print('25 percentile:', f'{np.percentile(subgroup, 25):.2f}')
        # print('75 percentile:', f'{np.percentile(subgroup, 75):.2f}')



np.set_printoptions(precision=2)
basic_stats(all3_f0)
# basic_stats(all3_energy)
# basic_stats(all3_rp)

exp2list = exp2list_f0
print('================EarlyLate================')
basic_stats(exp2list['EarlyLate'])

print('================BoudaryTone================')
basic_stats(exp2list['BoundaryTone'])

print('================PictureNaming================')
basic_stats(exp2list['PictureNaming'])




  stats of YA
(2466,)
5.22
0.29
5.32

  stats of OA
(5025,)
4.99
0.24
4.99

  stats of PD
(3152,)
5.00
0.25
5.02

  stats of YA
(1106,)
5.22
0.29
5.30

  stats of OA
(1757,)
5.00
0.25
4.99

  stats of PD
(1106,)
5.00
0.25
5.03

  stats of YA
(1360,)
5.23
0.29
5.32

  stats of OA
(2241,)
4.99
0.23
4.99

  stats of PD
(1425,)
4.98
0.25
4.99

  stats of YA
(0,)
nan
nan
nan

  stats of OA
(1027,)
5.00
0.24
4.99

  stats of PD
(621,)
5.03
0.25
5.07


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [4]:


from scipy.stats import shapiro, kruskal, kstest, norm, levene

groups = ['YA', 'OA', 'PD']


def stats_test(all3):
    for i, group in enumerate(all3):
        print(f"Group {groups[i]} - Mean: {np.mean(group):.2f}, Std: {np.std(group):.2f}, Median: {np.median(group):.2f}")
        # print size of each group
        print(f"Group {groups[i]} - Size: {len(group)}")

    for i, group in enumerate(all3):
        # data_mean = np.mean(group)
        # data_std = np.std(group)
        # standardized_data = (group - data_mean) / data_std
        d_stat, p_value = kstest(group, 'norm')
        print(f"kstest p-value: {p_value}")
        # stat, p = shapiro(group)
        # print(f"Group {groups[i]} - Shapiro-Wilk p-value: {p}")

    # print shape of all3
    print(all3[0].shape, all3[1].shape, all3[2].shape) 
    
    stat, p = kruskal(all3[0], all3[1])
    print(f"Kruskal-Wallis test p value between group {groups[0]} and {groups[1]} : {p}")

  
    stat, p = kruskal(all3[1], all3[2])
    print(f"Kruskal-Wallis test p value between group {groups[1]} and {groups[2]} : {p}")

    levene_stat, levene_p = levene(all3[0], all3[1], center='median')
    print(f"Levene test p value between group {groups[0]} and {groups[1]} : {levene_p}")


    levene_stat, levene_p = levene(all3[1], all3[2], center='median')
    print(f"Levene test p value between group {groups[1]} and {groups[2]} : {levene_p}")
    

    
def stats_test_exp(all3):
    groups = ['YA', 'OA', 'PD']

    for i, group in enumerate(all3):
        print(f"Group {i} - Mean: {np.mean(group):.2f}, Std: {np.std(group):.2f}, Median: {np.median(group):.2f}")


    for i, group in enumerate(all3):
        data_mean = np.mean(group)
        data_std = np.std(group)
        standardized_data = (group - data_mean) / data_std
        d_stat, p_value = kstest(standardized_data, 'norm')
        print(f"p-value: {p_value}")
        # stat, p = shapiro(group)
        # print(f"Group {groups[i]} - Shapiro-Wilk p-value: {p}")


    
    stat, p = kruskal(all3[0], all3[1])
    levene_stat, levene_p = levene(all3[0], all3[1])
    print(f"Kruskal-Wallis test p value between group {groups[0]} and {groups[1]} : {p}")
    print(f"Levene test p value between group {groups[0]} and {groups[1]} : {levene_p}")
    stat, p = kruskal(all3[1], all3[2])
    levene_stat, levene_p = levene(all3[1], all3[2])
    print(f"Kruskal-Wallis test p value between group {groups[1]} and {groups[2]} : {p}")
    print(f"Levene test p value between group {groups[1]} and {groups[2]} : {levene_p}")
    

    

print('================f0================')
stats_test(all3_f0)
# print('================energy================')
# stats_test(all3_energy)
# print('================rp================')
# stats_test(all3_rp)
for exp in ['EarlyLate', 'BoundaryTone', 'PictureNaming']:
    print(f'================{exp}================')
    print('================f0================')
    stats_test(exp2list_f0[exp])
    # print('================energy================')
    # stats_test(exp2list_energy[exp])
    # print('================rp================')
    # stats_test(exp2list_rp[exp])


Group YA - Mean: 192.33, Std: 46.48, Median: 201.32
Group YA - Size: 1106
Group OA - Mean: 152.11, Std: 36.06, Median: 147.47
Group OA - Size: 1757
Group PD - Mean: 153.87, Std: 37.88, Median: 152.47
Group PD - Size: 1106
kstest p-value: 0.0
kstest p-value: 0.0
kstest p-value: 0.0
(1106,) (1757,) (1106,)
Kruskal-Wallis test p value between group YA and OA : 1.211767437028444e-125
Kruskal-Wallis test p value between group OA and PD : 0.4430786071196968
Levene test p value between group YA and OA : 6.502577168569132e-06
Levene test p value between group OA and PD : 2.0010978225372455e-07
Group YA - Mean: 133.29, Std: 67.50, Median: 123.00
Group YA - Size: 1106
Group OA - Mean: 111.85, Std: 56.69, Median: 102.65
Group OA - Size: 1759
Group PD - Mean: 136.17, Std: 75.08, Median: 121.74
Group PD - Size: 1106
kstest p-value: 0.0
kstest p-value: 0.0
kstest p-value: 0.0
(1106,) (1759,) (1106,)
Kruskal-Wallis test p value between group YA and OA : 2.8243897224134942e-18
Kruskal-Wallis test p va

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  d_stat, p_value = kstest(group, 'norm')
  stat, p = kruskal(all3[0], all3[1])
  levene_stat, levene_p = levene(all3[0], all3[1], center='median')


In [22]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

all_data = []

for exp in ['EarlyLate', 'BoundaryTone', 'PictureNaming']:
    experiment_data = exp2list_f0[exp]
    for group_idx, group_array in enumerate(experiment_data):
        for value in group_array:
            utt = {
                'experiment': exp,
                'group': groups[group_idx],
                'value': value
            }
            all_data.append(utt)

df = pd.DataFrame(all_data)

model = smf.mixedlm("value ~ group + experiment", 
                    data=df, 
                    groups=df["group"]
                    )

result = model.fit()

print(result.summary())

                     Mixed Linear Model Regression Results
Model:                     MixedLM        Dependent Variable:        value      
No. Observations:          10643          Method:                    REML       
No. Groups:                3              Scale:                     1475.8213  
Min. group size:           2466           Log-Likelihood:            -53926.8884
Max. group size:           5025           Converged:                 Yes        
Mean group size:           3547.7                                               
--------------------------------------------------------------------------------
                             Coef.      Std.Err.      z   P>|z|  [0.025   0.975]
--------------------------------------------------------------------------------
Intercept                    149.995         38.423 3.904 0.000   74.688 225.302
group[T.PD]                    1.138         54.336 0.021 0.983 -105.359 107.634
group[T.YA]                   41.934         54.33