In [1]:

import os
import glob
from pathlib import Path

import numpy as np
from tqdm import tqdm



base_folder_path = Path('/data/storage025/wavs_single_channel_normalized_nosil/')

# np.set_printoptions(threshold=np.inf)




def get_group_id(filename):
    filename = os.path.basename(filename)
    group_id = filename.split('_')[0][-4:-2]
    if group_id not in ['11', '21', '22']:
        raise ValueError(f"Invalid group id {group_id}")
    return group_id

def add2list(group_id, feature, ls):
    if group_id == '11':
        ls[0].append(feature)
    elif group_id == '21':
        ls[1].append(feature)
    elif group_id == '22':
        ls[2].append(feature)
    else:
        print(f'Invalid group id {group_id}')
        
       
       
def load_f0():
    feature_name = 'f0'
    
    # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    avg_diff = []
    # f0_diff_list = open('f0_diff_list.txt', 'w')
    # non_f0_list = open('non_f0_list.txt', 'w')
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
    # for folder in ['BoundaryTone']:
        feature_folder = os.path.join(base_folder_path, folder + '-features', feature_name)
        feature_folder = Path(feature_folder)
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')
        

        cnt = 0
        cnt2 = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff_each = np.average(np.abs(feature - feature_nonorm))
            # if avg_diff_each > 1:
            #     f0_diff_list.write(f'{os.path.basename(npy_file)}\t{avg_diff_each}\n') 
            avg_diff.append(avg_diff_each)
            
            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                # non_f0_list.write(f'{os.path.basename(npy_file)}\n')
                continue
            
            if np.max(feature) < 60:
                print(f'max value < 60 in {npy_file}')
                cnt2 += 1
                # non_f0_list.write(f'{os.path.basename(npy_file)}\n')

                continue
            
            group_id = get_group_id(npy_file.stem)
            add2list(group_id, feature, exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        print(f'{cnt2} files with max value < 60Hz')
        
    
    avg_diff = np.array(avg_diff)
    # print top 10 largest diff
    print(len(avg_diff))
    # print(np.sort(avg_diff)[-800])
    print('mean diff: ', np.min(avg_diff))
    print('avg diff: ', np.average(avg_diff)) 
    # close
    # f0_diff_list.close()
    # non_f0_list.close()
    
    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]

    # deep copy BD
    all3_final = [[], [], []]

    for i in range(3):
        tmp = np.concatenate(all3[i], axis=0)
        # remove all 0 value
        # print(tmp.shape)
        tmp = tmp[tmp != 0]    
        # print(tmp.shape)
        # remove value > 500
        tmp = tmp[tmp < 500.0]
        # print(tmp.shape)
        all3_final[i] = tmp
        
        tmp = np.concatenate(exp2lists['BoundaryTone'][i])
        tmp = tmp[tmp != 0]
        tmp = tmp[tmp < 500.0]
        exp2lists['BoundaryTone'][i] = tmp
        
        tmp = np.concatenate(exp2lists['EarlyLate'][i])
        tmp = tmp[tmp != 0]
        tmp = tmp[tmp < 500.0]
        exp2lists['EarlyLate'][i] = tmp

        tmp = np.concatenate(exp2lists['PictureNaming'][i]) if len(exp2lists['PictureNaming'][i]) > 0 else np.array([0])
        tmp = tmp[tmp != 0]
        tmp = tmp[tmp < 500.0]
        exp2lists['PictureNaming'][i] = tmp
        
    return all3_final, exp2lists


def load_frame_feat(feature_name='energy'):
        # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    avg_diff = []
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
        feature_folder = os.path.join(base_folder_path, folder + '-features', feature_name)
        feature_folder = Path(feature_folder)
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        cnt = 0
        for npy_file in npy_files:
            feature = np.load(npy_file)
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff_each = np.average(np.abs(feature - feature_nonorm))
            # if avg_diff_each > 1:
            #     f0_diff_list.write(f'{os.path.basename(npy_file)}\t{avg_diff_each}\n') 
            avg_diff.append(avg_diff_each)
            
            # check if all 0 value
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                # non_f0_list.write(f'{os.path.basename(npy_file)}\n')
                continue
            
            group_id = get_group_id(npy_file.stem)
            add2list(group_id, feature, exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        
    
    avg_diff = np.array(avg_diff)
    # print top 10 largest diff
    print(len(avg_diff))
    print('top10 max diff: ', np.sort(avg_diff)[-10:])
    print('mean diff: ', np.min(avg_diff))
    print('avg diff: ', np.average(avg_diff)) 
    # close
    # f0_diff_list.close()
    # non_f0_list.close()
    
    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]

    # deep copy BD
    all3_final = [[], [], []]

    for i in range(3):
        tmp = np.concatenate(all3[i], axis=0)
        # remove all 0 value
        print(tmp.shape)
        tmp = tmp[tmp != 0]    
        print(tmp.shape)
        # remove value > 500
        # tmp = tmp[tmp < 500.0]
        # print(tmp.shape)
        all3_final[i] = tmp
        
        tmp = np.concatenate(exp2lists['BoundaryTone'][i])
        tmp = tmp[tmp != 0]
        exp2lists['BoundaryTone'][i] = tmp

        tmp = np.concatenate(exp2lists['EarlyLate'][i])
        tmp = tmp[tmp != 0]
        exp2lists['EarlyLate'][i] = tmp

        tmp = np.concatenate(exp2lists['PictureNaming'][i]) if len(exp2lists['PictureNaming'][i]) > 0 else np.array([0])
        tmp = tmp[tmp != 0]
        exp2lists['PictureNaming'][i] = tmp
        
    return all3_final, exp2lists
    

def load_rp():
        # 3 sublists for YA OA PD
    exp2lists = {'BoundaryTone': [[], [], []], 'EarlyLate': [[], [], []], 'PictureNaming': [[], [], []]}
    
    for folder in ['BoundaryTone', 'EarlyLate', 'PictureNaming']:
        feature_list = os.path.join('/home/yzhong/gits/TurnTakingPD/filelists', 'clean_id_responsetime_' + folder + '_filtered.txt')
        
        cnt = 0
        with open(feature_list, 'r') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                rp = float(line.strip().split('\t')[-1])
                basename = line.strip().split('\t')[0]
                group_id = get_group_id(basename)
                add2list(group_id, rp, exp2lists[folder])
                cnt += 1
                
        print(f'Processing {feature_list} ...')
        print(f'Found {cnt}')


    all3 = [[], [], []] 
    for i in range(3):
        all3[i] += exp2lists['BoundaryTone'][i] + exp2lists['EarlyLate'][i] + exp2lists['PictureNaming'][i]
    
        # deep copy BD
    all3_final = [[], [], []]

    for i in range(3):
        tmp = np.array(all3[i])
        # remove all 0 value
        print(tmp.shape)
        tmp = tmp[tmp != 0]    
        print(tmp.shape)
        # remove value > 500
        # tmp = tmp[tmp < 500.0]
        # print(tmp.shape)
        all3_final[i] = tmp
        
        tmp = np.array(exp2lists['BoundaryTone'][i])
        tmp = tmp[tmp != 0]
        exp2lists['BoundaryTone'][i] = tmp

        tmp = np.array(exp2lists['EarlyLate'][i])
        tmp = tmp[tmp != 0]
        exp2lists['EarlyLate'][i] = tmp

        tmp = np.array(exp2lists['PictureNaming'][i]) if len(exp2lists['PictureNaming'][i]) > 0 else np.array([0])
        tmp = tmp[tmp != 0]
        exp2lists['PictureNaming'][i] = tmp
    
    return all3_final, exp2lists

all3_f0, exp2list_f0 = load_f0()
all3_energy, exp2list_energy = load_frame_feat(feature_name='energy')
all3_rp, exp2list_rp = load_rp()
# print(exp2list)

        


Processing BoundaryTone folder...
Found 5026 npy files
0 files with all 0 values
0 files with max value < 60Hz
Processing EarlyLate folder...
Found 3971 npy files
0 files with all 0 values
0 files with max value < 60Hz
Processing PictureNaming folder...
Found 1652 npy files
0 files with all 0 values
0 files with max value < 60Hz
10649
mean diff:  0.0
avg diff:  0.9964859048099522
Processing BoundaryTone folder...
Found 5026 npy files
0 files with all 0 values
Processing EarlyLate folder...
Found 3971 npy files
0 files with all 0 values
Processing PictureNaming folder...
Found 1652 npy files
0 files with all 0 values
10649
top10 max diff:  [418.68817 432.55255 436.08682 445.66376 448.34012 464.94125 466.5552
 474.7824  486.19662 652.58307]
mean diff:  0.32798612
avg diff:  101.67225
(132039,)
(132039,)
(316733,)
(316733,)
(240336,)
(240336,)
Processing /home/yzhong/gits/TurnTakingPD/filelists/clean_id_responsetime_BoundaryTone_filtered.txt ...
Found 5026
Processing /home/yzhong/gits/Tur

In [5]:
def basic_stats(all3_final):

    group_name = ['YA', 'OA', 'PD']

    for i, subgroup in enumerate(all3_final):
        print(f'\n  stats of {group_name[i]}')
        # merge all data in subgroup
        print(subgroup.shape)
        # calculate min, max, mean, std
        # only print with 2 decimal
              
        # print('min:', f'{np.min(subgroup):.2f}')
        # print('max:', f'{np.max(subgroup):.2f}')
        # print('mean:', f'{np.mean(subgroup):.2f}')
        # print('std:', f'{np.std(subgroup):.2f}')
        # print('median:', f'{np.median(subgroup):.2f}')
        # print mean std median, only print value
        print(f'{np.mean(subgroup):.2f}')
        print(f'{np.std(subgroup):.2f}')
        print(f'{np.median(subgroup):.2f}')
        
        # print('25 percentile:', f'{np.percentile(subgroup, 25):.2f}')
        # print('75 percentile:', f'{np.percentile(subgroup, 75):.2f}')



np.set_printoptions(precision=2)
basic_stats(all3_f0)
basic_stats(all3_energy)
basic_stats(all3_rp)

# print('================EarlyLate================')
# basic_stats(exp2list['EarlyLate'])

# print('================BoudaryTone================')
# basic_stats(exp2list['BoundaryTone'])

# print('================PictureNaming================')
# basic_stats(exp2list['PictureNaming'])




  stats of YA
(158261,)
197.03
52.27
205.27

  stats of OA
(350434,)
155.28
46.83
149.40

  stats of PD
(249388,)
155.11
46.67
153.78

  stats of YA
(132039,)
99.83
133.12
47.50

  stats of OA
(316733,)
75.78
115.46
25.61

  stats of PD
(240336,)
72.14
122.38
13.92

  stats of YA
(2466,)
1.09
0.95
0.89

  stats of OA
(5030,)
1.13
0.91
0.95

  stats of PD
(3153,)
1.36
1.68
0.99


In [6]:


from scipy.stats import shapiro, kruskal, kstest, norm, levene

def stats_test(all3):
    groups = ['YA', 'OA', 'PD']
    for i, group in enumerate(all3):
        print(f"Group {groups[i]} - Mean: {np.mean(group):.2f}, Std: {np.std(group):.2f}, Median: {np.median(group):.2f}")
        # print size of each group
        print(f"Group {groups[i]} - Size: {len(group)}")

    for i, group in enumerate(all3):
        # data_mean = np.mean(group)
        # data_std = np.std(group)
        # standardized_data = (group - data_mean) / data_std
        d_stat, p_value = kstest(group, 'norm')
        print(f"kstest p-value: {p_value}")
        # stat, p = shapiro(group)
        # print(f"Group {groups[i]} - Shapiro-Wilk p-value: {p}")

    # print shape of all3
    print(all3[0].shape, all3[1].shape, all3[2].shape) 
    
    stat, p = kruskal(all3[0], all3[1])
    print(f"Kruskal-Wallis test p value between group {groups[0]} and {groups[1]} : {p}")

  
    stat, p = kruskal(all3[1], all3[2])
    print(f"Kruskal-Wallis test p value between group {groups[1]} and {groups[2]} : {p}")

    levene_stat, levene_p = levene(all3[0], all3[1], center='median')
    print(f"Levene test p value between group {groups[0]} and {groups[1]} : {levene_p}")


    levene_stat, levene_p = levene(all3[1], all3[2], center='median')
    print(f"Levene test p value between group {groups[1]} and {groups[2]} : {levene_p}")
    

    
def stats_test_exp(all3):
    groups = ['YA', 'OA', 'PD']

    for i, group in enumerate(all3):
        print(f"Group {i} - Mean: {np.mean(group):.2f}, Std: {np.std(group):.2f}, Median: {np.median(group):.2f}")


    for i, group in enumerate(all3):
        data_mean = np.mean(group)
        data_std = np.std(group)
        standardized_data = (group - data_mean) / data_std
        d_stat, p_value = kstest(standardized_data, 'norm')
        print(f"p-value: {p_value}")
        # stat, p = shapiro(group)
        # print(f"Group {groups[i]} - Shapiro-Wilk p-value: {p}")


    
    stat, p = kruskal(all3[0], all3[1])
    levene_stat, levene_p = levene(all3[0], all3[1])
    print(f"Kruskal-Wallis test p value between group {groups[0]} and {groups[1]} : {p}")
    print(f"Levene test p value between group {groups[0]} and {groups[1]} : {levene_p}")
    stat, p = kruskal(all3[1], all3[2])
    levene_stat, levene_p = levene(all3[1], all3[2])
    print(f"Kruskal-Wallis test p value between group {groups[1]} and {groups[2]} : {p}")
    print(f"Levene test p value between group {groups[1]} and {groups[2]} : {levene_p}")
    

    

# print('================f0================')
# stats_test(all3_f0)
# print('================energy================')
# stats_test(all3_energy)
# print('================rp================')
# stats_test(all3_rp)
for exp in ['EarlyLate', 'BoundaryTone', 'PictureNaming']:
    print(f'================{exp}================')
    print('================f0================')
    stats_test(exp2list_f0[exp])
    # print('================energy================')
    # stats_test(exp2list_energy[exp])
    # print('================rp================')
    # stats_test(exp2list_rp[exp])


Group YA - Mean: 196.02, Std: 49.89, Median: 202.91
Group YA - Size: 43197
Group OA - Mean: 152.41, Std: 43.05, Median: 147.68
Group OA - Size: 70529
Group PD - Mean: 157.32, Std: 43.50, Median: 156.46
Group PD - Size: 47982
kstest p-value: 0.0
kstest p-value: 0.0
kstest p-value: 0.0
(43197,) (70529,) (47982,)
Kruskal-Wallis test p value between group YA and OA : 0.0
Kruskal-Wallis test p value between group OA and PD : 4.922888663751588e-96
Levene test p value between group YA and OA : 5.660633529818747e-69
Levene test p value between group OA and PD : 2.6815097485583106e-40
Group YA - Mean: 197.40, Std: 53.13, Median: 205.27
Group YA - Size: 115064
Group OA - Mean: 154.74, Std: 46.76, Median: 149.40
Group OA - Size: 230723
Group PD - Mean: 151.85, Std: 46.38, Median: 150.26
Group PD - Size: 168630
kstest p-value: 0.0
kstest p-value: 0.0
kstest p-value: 0.0
(115064,) (230723,) (168630,)
Kruskal-Wallis test p value between group YA and OA : 0.0
Kruskal-Wallis test p value between group

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  d_stat, p_value = kstest(group, 'norm')
  stat, p = kruskal(all3[0], all3[1])
  levene_stat, levene_p = levene(all3[0], all3[1], center='median')
