In [21]:

import os
import glob
from pathlib import Path

import numpy as np
from tqdm import tqdm



base_folder_path = Path('/data/storage025/wavs_single_channel_nosil/')

# np.set_printoptions(threshold=np.inf)




def get_group_id(filename):
    """Extract the two-character group id encoded in a file name.

    The id is read from the part of the base name that precedes the first
    underscore: it is the two characters sitting at positions -4 and -3 of
    that prefix.

    Args:
        filename: File name or path; only its base name is inspected.

    Returns:
        One of ``'11'``, ``'21'`` or ``'22'``.

    Raises:
        ValueError: If the extracted id is not one of the three known ids.
    """
    prefix, _, _ = os.path.basename(filename).partition('_')
    group_id = prefix[-4:-2]
    if group_id in ('11', '21', '22'):
        return group_id
    raise ValueError(f"Invalid group id {group_id}")

def add2list(group_id, feature, ls):
    """Append ``feature`` to the sub-list of ``ls`` that belongs to ``group_id``.

    Slot 0 receives group ``'11'``, slot 1 group ``'21'`` and slot 2 group
    ``'22'``. Any other id only prints a message and leaves ``ls`` untouched.

    Args:
        group_id: Group id string, as produced by ``get_group_id``.
        feature: Value to store.
        ls: Sequence of three lists, mutated in place.
    """
    known_ids = ('11', '21', '22')
    if group_id not in known_ids:
        print(f'Invalid group id {group_id}')
        return
    ls[known_ids.index(group_id)].append(feature)
        
       
       
def load_f0():
    """Load per-file f0 arrays and pool them by speaker group.

    For every experiment folder under ``base_folder_path`` the ``*.npy`` files
    in ``<experiment>-features/f0`` are loaded. Each file is also compared with
    the file whose path is obtained by replacing ``'_normalized_'`` with
    ``'_'``; the mean absolute difference per file is collected and summarised
    on stdout.

    Files that contain only zeros, or whose maximum is below 60, are counted
    and skipped. The remaining arrays are grouped by the id returned by
    ``get_group_id`` (slot 0 = '11', slot 1 = '21', slot 2 = '22').

    Returns:
        list: Three 1-D arrays, one per group, pooled over all experiments,
        with zeros and values >= 500 removed. A group without any usable file
        yields an empty array.
    """
    feature_name = 'f0'
    experiments = ['BoundaryTone', 'EarlyLate', 'PictureNaming']

    # 3 sublists per experiment (YA, OA, PD according to the original comment)
    exp2lists = {name: [[], [], []] for name in experiments}
    avg_diff = []
    for folder in experiments:
        feature_folder = Path(os.path.join(base_folder_path, folder + '-features', feature_name))
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        cnt = 0   # files that are entirely zero
        cnt2 = 0  # files whose maximum stays below 60
        for npy_file in npy_files:
            feature = np.load(npy_file)
            # Counterpart path without the '_normalized_' marker.
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff.append(np.average(np.abs(feature - feature_nonorm)))

            # Skip files without any non-zero value.
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                continue

            if np.max(feature) < 60:
                print(f'max value < 60 in {npy_file}')
                cnt2 += 1
                continue

            group_id = get_group_id(npy_file.stem)
            add2list(group_id, feature, exp2lists[folder])
        print(f'{cnt} files with all 0 values')
        print(f'{cnt2} files with max value < 60Hz')

    avg_diff = np.array(avg_diff)
    print(len(avg_diff))
    # np.min raises on an empty array, so only summarise when files were found.
    if avg_diff.size:
        # The original labelled this value "mean diff" although it is the minimum.
        print('min diff: ', np.min(avg_diff))
        print('avg diff: ', np.average(avg_diff))

    all3_final = []
    for i in range(3):
        # Pool group i over the experiments, in experiment order.
        pooled = [arr for name in experiments for arr in exp2lists[name][i]]
        # np.concatenate rejects an empty list; fall back to an empty array.
        tmp = np.concatenate(pooled, axis=0) if pooled else np.empty(0)
        print(tmp.shape)
        # Drop zero entries.
        tmp = tmp[tmp != 0]
        print(tmp.shape)
        # Keep only values below 500.
        tmp = tmp[tmp < 500.0]
        print(tmp.shape)
        all3_final.append(tmp)

    return all3_final


def load_frame_feat(feature_name='energy'):
    """Load a per-file feature and pool it by speaker group.

    For every experiment folder under ``base_folder_path`` the ``*.npy`` files
    in ``<experiment>-features/<feature_name>`` are loaded. Each file is also
    compared with the file whose path is obtained by replacing
    ``'_normalized_'`` with ``'_'``; the mean absolute difference per file is
    collected and summarised on stdout.

    Files that contain only zeros are counted and skipped. The remaining
    arrays are grouped by the id returned by ``get_group_id`` (slot 0 = '11',
    slot 1 = '21', slot 2 = '22').

    Args:
        feature_name: Name of the feature sub-folder to read.

    Returns:
        list: Three 1-D arrays, one per group, pooled over all experiments,
        with zeros removed. A group without any usable file yields an empty
        array.
    """
    experiments = ['BoundaryTone', 'EarlyLate', 'PictureNaming']

    # 3 sublists per experiment (YA, OA, PD according to the original comment)
    exp2lists = {name: [[], [], []] for name in experiments}
    avg_diff = []
    for folder in experiments:
        feature_folder = Path(os.path.join(base_folder_path, folder + '-features', feature_name))
        npy_files = list(feature_folder.glob('*.npy'))
        print(f'Processing {folder} folder...')
        print(f'Found {len(npy_files)} npy files')

        cnt = 0  # files that are entirely zero
        for npy_file in npy_files:
            feature = np.load(npy_file)
            # Counterpart path without the '_normalized_' marker.
            feature_nonorm = np.load(str(npy_file).replace('_normalized_', '_'))
            avg_diff.append(np.average(np.abs(feature - feature_nonorm)))

            # Skip files without any non-zero value.
            if np.max(feature) == 0 and np.min(feature) == 0:
                cnt += 1
                continue

            group_id = get_group_id(npy_file.stem)
            add2list(group_id, feature, exp2lists[folder])
        print(f'{cnt} files with all 0 values')

    avg_diff = np.array(avg_diff)
    print(len(avg_diff))
    print('top10 max diff: ', np.sort(avg_diff)[-10:])
    # np.min raises on an empty array, so only summarise when files were found.
    if avg_diff.size:
        # The original labelled this value "mean diff" although it is the minimum.
        print('min diff: ', np.min(avg_diff))
        print('avg diff: ', np.average(avg_diff))

    all3_final = []
    for i in range(3):
        # Pool group i over the experiments, in experiment order.
        pooled = [arr for name in experiments for arr in exp2lists[name][i]]
        # np.concatenate rejects an empty list; fall back to an empty array.
        tmp = np.concatenate(pooled, axis=0) if pooled else np.empty(0)
        print(tmp.shape)
        # Drop zero entries.
        tmp = tmp[tmp != 0]
        print(tmp.shape)
        all3_final.append(tmp)

    return all3_final
    

def load_rp(filelist_dir='/home/yzhong/gits/TurnTakingPD/filelists'):
    """Load response values from the per-experiment file lists.

    For each experiment the file
    ``clean_id_responsetime_<experiment>_filtered.txt`` inside ``filelist_dir``
    is read. Every non-blank line is tab-separated: the first field is the
    name passed to ``get_group_id`` and the last field is parsed as a float.

    Args:
        filelist_dir: Directory holding the three file lists. Defaults to the
            location that was previously hard-coded.

    Returns:
        tuple: ``(all3_final, exp2lists)`` where ``all3_final`` is a list of
        three arrays (one per group, pooled over all experiments, zeros
        removed) and ``exp2lists`` maps each experiment name to a list of
        three arrays (one per group). The per-experiment arrays are NOT
        zero-filtered.

    Raises:
        ValueError: If a line's last field is not a float or its group id is
            not recognised by ``get_group_id``.
    """
    experiments = ['BoundaryTone', 'EarlyLate', 'PictureNaming']

    # 3 sublists per experiment (YA, OA, PD according to the original comment)
    exp2lists = {name: [[], [], []] for name in experiments}

    for folder in experiments:
        feature_list = os.path.join(filelist_dir, 'clean_id_responsetime_' + folder + '_filtered.txt')

        cnt = 0
        with open(feature_list, 'r') as f:
            for line in f:
                line = line.strip()
                # A blank line would make float('') raise; skip it instead.
                if not line:
                    continue
                fields = line.split('\t')
                rp = float(fields[-1])
                group_id = get_group_id(fields[0])
                add2list(group_id, rp, exp2lists[folder])
                cnt += 1

        print(f'Processing {feature_list} ...')
        print(f'Found {cnt}')

    all3_final = []
    for i in range(3):
        # Pool group i over the experiments, in experiment order.
        pooled = [value for name in experiments for value in exp2lists[name][i]]
        tmp = np.array(pooled)
        print(tmp.shape)
        # Drop zero entries from the pooled array only.
        tmp = tmp[tmp != 0]
        print(tmp.shape)
        all3_final.append(tmp)
        # Convert the per-experiment lists to arrays for the caller.
        for name in experiments:
            exp2lists[name][i] = np.array(exp2lists[name][i])

    return all3_final, exp2lists

# Only the data returned by load_rp() is loaded in this cell; the f0 and
# energy loaders below are left disabled.
# all3_f0 = load_f0()
# all3_energy = load_frame_feat(feature_name='energy')
# all3_rp: pooled per-group arrays; exp2list: per-experiment, per-group arrays.
all3_rp, exp2list = load_rp()
print(exp2list)

        


Processing /home/yzhong/gits/TurnTakingPD/filelists/clean_id_responsetime_BoundaryTone_filtered.txt ...
Found 2785
Processing /home/yzhong/gits/TurnTakingPD/filelists/clean_id_responsetime_EarlyLate_filtered.txt ...
Found 3971
Processing /home/yzhong/gits/TurnTakingPD/filelists/clean_id_responsetime_PictureNaming_filtered.txt ...
Found 1778
(2592,)
(2592,)
(3841,)
(3841,)
(2101,)
(2101,)
{'BoundaryTone': [array([0.98, 0.7 , 1.27, ..., 0.73, 0.57, 1.59]), array([2.46, 0.6 , 0.82, ..., 0.57, 0.57, 1.05]), array([ 1.56,  8.47,  1.11,  2.97,  1.5 ,  0.89,  0.92,  1.5 ,  1.27,
        0.66,  0.79,  1.94,  0.95,  1.08,  1.56,  1.82,  1.59,  2.3 ,
        0.95,  1.78,  1.98,  8.57,  5.72,  2.68,  1.3 ,  0.47,  0.79,
        1.4 ,  0.92,  1.62,  1.34,  1.14,  1.59,  9.78,  0.95,  0.92,
        1.18,  1.5 ,  2.58,  0.5 ,  1.14,  1.98,  1.08,  1.82,  9.18,
        0.86,  1.02,  2.04,  4.5 ,  1.24,  4.41,  1.69,  0.38,  1.34,
        1.27,  1.02,  1.24,  1.46,  0.86, 12.7 ,  1.05,  3.74,  0.66,
 

In [24]:
def basic_stats(all3_final):
    """Print summary statistics for each group array.

    For every array in ``all3_final`` a header with the group label
    (``YA``, ``OA``, ``PD`` by position) and the array shape is printed,
    followed by min, max, mean, std, median and the 25th / 75th percentiles,
    each formatted with two decimals.

    Args:
        all3_final: Sequence of NumPy arrays, one per group.
    """
    labels = ['YA', 'OA', 'PD']

    # (caption, reducer) pairs, in the order they are printed.
    summaries = (
        ('min:', np.min),
        ('max:', np.max),
        ('mean:', np.mean),
        ('std:', np.std),
        ('median:', np.median),
        ('25 percentile:', lambda values: np.percentile(values, 25)),
        ('75 percentile:', lambda values: np.percentile(values, 75)),
    )

    for idx, values in enumerate(all3_final):
        print(f'\n  stats of {labels[idx]}')
        print(values.shape)
        for caption, reducer in summaries:
            print(caption, f'{reducer(values):.2f}')



# Show two digits of precision when NumPy prints floating-point arrays.
np.set_printoptions(precision=2)
# basic_stats(all3_energy)
# Statistics pooled over all experiments first, then one report per experiment.
basic_stats(all3_rp)
for exp_name in ('BoundaryTone', 'EarlyLate', 'PictureNaming'):
    # Building the header from the key fixes the former 'BoudaryTone' typo.
    print(f'================{exp_name}================')
    basic_stats(exp2list[exp_name])


  stats of YA
(2592,)
min: 0.00
max: 17.21
mean: 1.09
std: 0.93
median: 0.89
25 percentile: 0.66
75 percentile: 1.21

  stats of OA
(3841,)
min: 0.00
max: 16.18
mean: 1.10
std: 0.96
median: 0.89
25 percentile: 0.63
75 percentile: 1.29

  stats of PD
(2101,)
min: 0.01
max: 38.52
mean: 1.48
std: 1.97
median: 1.03
25 percentile: 0.71
75 percentile: 1.56

  stats of YA
(1360,)
min: 0.15
max: 6.49
mean: 0.97
std: 0.49
median: 0.89
25 percentile: 0.66
75 percentile: 1.14

  stats of OA
(1052,)
min: 0.15
max: 9.08
mean: 0.99
std: 0.64
median: 0.86
25 percentile: 0.63
75 percentile: 1.18

  stats of PD
(373,)
min: 0.15
max: 38.52
mean: 2.54
std: 3.86
median: 1.30
25 percentile: 0.89
75 percentile: 2.10

  stats of YA
(1106,)
min: 0.00
max: 17.21
mean: 1.25
std: 1.29
median: 0.89
25 percentile: 0.64
75 percentile: 1.32

  stats of OA
(1759,)
min: 0.00
max: 16.18
mean: 1.15
std: 1.24
median: 0.82
25 percentile: 0.46
75 percentile: 1.41

  stats of PD
(1106,)
min: 0.01
max: 14.78
mean: 1.28
std: