In [1]:
import ast
import pandas as pd
import numpy as np
import glob, os, librosa, soundfile, json, h5py, logging

import librosa.display
from multiprocessing import Pool

from utils.util_process import *

import matplotlib.pyplot as plt
from IPython.display import Audio

In [2]:
# Filesystem locations used by every later cell:
#   DATA_DIR - root that is globbed for the source wav files
#   META_DIR - root of the strong/weak label metadata tables
#   SAVE_DIR - root under which segments, CSVs and HDF5 files are written
DATA_DIR = '/media/ubuntu/HD_new/download/audioset/audios'
META_DIR = '/home/whq/projects/with-dog-audio/audioset-tidy/metadata'
SAVE_DIR = '/media/ubuntu/ssd2t/AIGroup/Audio-Data/audioset_strong'

# Split to process; selects the metadata files that are loaded and names the outputs.
PROCESS_MODE = 'eval'
assert PROCESS_MODE in ('train', 'eval')


In [15]:
# Class tables: strong (mid -> display name) and weak (index, mid, display name).
strong_cls_df = pd.read_csv(f"{META_DIR}/strong/mid_to_display_name.tsv", sep='\t')
weak_cls_df = pd.read_csv(f"{META_DIR}/weak/class_labels_indices.csv")

# Segment tables for the selected split. The weak CSVs are separated by ", ",
# which needs the python parsing engine (multi-character separator).
if PROCESS_MODE == 'train':
    strong_seg_df = pd.read_csv(f"{META_DIR}/strong/audioset_train_strong.tsv", sep='\t')
    weak_seg_dfs = [
        pd.read_csv(f"{META_DIR}/weak/unbalanced_train_segments.csv", sep=", ", engine='python'),
        pd.read_csv(f"{META_DIR}/weak/balanced_train_segments.csv", sep=", ", engine='python'),
    ]
    weak_seg_df = pd.concat(weak_seg_dfs, axis=0)
elif PROCESS_MODE == 'eval':
    strong_seg_df = pd.read_csv(f"{META_DIR}/strong/audioset_eval_strong.tsv", sep='\t')
    weak_seg_df = pd.read_csv(f"{META_DIR}/weak/eval_segments.csv", sep=", ", engine='python')

In [16]:
print(strong_cls_df.shape)
# Integer label ids follow the alphabetical order of the display names.
strong_cls_df = strong_cls_df.sort_values(by='display_name')
strong_cls_df['label'] = list(range(len(strong_cls_df)))
strong_cls_df.head()

(456, 2)


Unnamed: 0,mid,display_name,label
241,/m/07q2z82,"Accelerating, revving, vroom",0
383,/m/0gvgw0,Air brake,1
75,/m/025wky1,Air conditioning,2
165,/m/05x_td,"Air horn, truck horn",3
397,/m/0k5j,Aircraft,4


In [17]:
# Build the two lookup tables used by the rest of the notebook and persist them:
#   class2label: display name -> integer label
#   mid2label:   AudioSet mid -> integer label
classes = strong_cls_df['display_name'].unique()
class2label, mid2label = {}, {}
for row_mid, row_name, row_label in zip(strong_cls_df['mid'],
                                        strong_cls_df['display_name'],
                                        strong_cls_df['label'].tolist()):
    # The first row seen for a display name fixes that class's label. Every mid
    # is registered (sharing the class label when a display name repeats), so a
    # later mid2label[...] lookup cannot fail for a mid listed in the table.
    mid2label[row_mid] = class2label.setdefault(row_name, row_label)
with open(f'{META_DIR}/tiff_class2label.json', 'w') as f:
    json.dump({"class2label": class2label, "mid2label": mid2label}, f)

In [18]:
# Quick look at the weak-label class table: its shape, then the first rows.
print(weak_cls_df.shape)
weak_cls_df.head()

(527, 3)


Unnamed: 0,index,mid,display_name
0,0,/m/09x0r,Speech
1,1,/m/05zppz,"Male speech, man speaking"
2,2,/m/02zsn,"Female speech, woman speaking"
3,3,/m/0ytgt,"Child speech, kid speaking"
4,4,/m/01h8n0,Conversation


In [19]:
print(strong_seg_df.shape)
# wav_id: 'Y' + segment_id with its last '_'-separated field removed.
strong_seg_df['wav_id'] = strong_seg_df['segment_id'].apply(
    lambda seg_id: 'Y' + '_'.join(seg_id.split('_')[:-1]))
# label_id: integer label of the annotation's mid (KeyError if the mid is unknown).
strong_seg_df['label_id'] = strong_seg_df['label'].apply(lambda mid: mid2label[mid])
strong_seg_df.head()

(139538, 4)


Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,wav_id,label_id
0,s9d-2nhuJCQ_30000,0.0,10.0,/m/04rlf,Ys9d-2nhuJCQ,254
1,s9d-2nhuJCQ_30000,2.627,7.237,/m/053hz1,Ys9d-2nhuJCQ,74
2,s9d-2nhuJCQ_30000,2.627,9.239,/m/03qtwd,Ys9d-2nhuJCQ,114
3,s9d-2nhuJCQ_30000,5.634,6.649,/m/01w250,Ys9d-2nhuJCQ,435
4,s9d-2nhuJCQ_30000,7.201,8.56,/m/0l15bq,Ys9d-2nhuJCQ,93


In [20]:
print(weak_seg_df.shape)
# wav_id: file stem of the clip, i.e. the YouTube id prefixed with 'Y'.
weak_seg_df['wav_id'] = weak_seg_df['YTID'].apply(lambda x : 'Y' + x)
# label_id: one integer label per comma-separated mid; mids that are not in
# mid2label become -1. The surrounding double quotes are stripped explicitly
# rather than by slicing off the first/last character, so the mapping stays
# correct whether or not the CSV parser left the quotes in the value.
weak_seg_df['label_id'] = weak_seg_df['positive_labels'].apply(
    lambda x: [mid2label.get(t.strip(), -1) for t in x.strip().strip('"').split(',')])
weak_seg_df.head()

(20371, 4)


Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels,wav_id,label_id
0,--4gqARaEJE,0.0,10.0,"""/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk""",Y--4gqARaEJE,"[131, 364, 130, 10]"
1,--BfvyPmVMo,20.0,30.0,"""/m/03l9g""",Y--BfvyPmVMo,[194]
2,--U7joUcTCo,0.0,10.0,"""/m/01b_21""",Y--U7joUcTCo,[105]
3,--i-y1v8Hy8,0.0,9.0,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005""",Y--i-y1v8Hy8,"[254, 358, 156, 77]"
4,-0BIyqJj9ZU,30.0,40.0,"""/m/07rgt08,/m/07sq110,/t/dd00001""",Y-0BIyqJj9ZU,"[89, 29, 17]"


In [21]:
# Audio / feature settings shared by the segmentation and log-mel steps.
params = dict(
    sample_rate=32000,
    clip_samples=32000 * 10,
    crop_second=2.0,            # length of one segment window, in seconds
    croped_pad_second=0.05,     # extra context added around each window, in seconds
    input_size=(int(32000 * 2.0 / 320) + 1, 64),   # (frames, mel bins) stored per segment
    n_fft=1024,
    hop_length=320,
    win_length=1024,
    lower_hertz=50,
    upper_hertz=14000,
    mel_bins=64,
)
# Display names of the classes treated as positives / as "ignore", and their label ids.
POSITIVE_CLSSSES = ['Bark', 'Bow-wow', 'Yip']
POSITIVE_LABELS = [class2label[name] for name in POSITIVE_CLSSSES]
IGNORE_CLASSES   = ['Dog', 'Howl', 'Growling', 'Whimper (dog)']
IGNORE_LABELS = [class2label[name] for name in IGNORE_CLASSES]

In [22]:
# Show the integer label ids resolved for the positive and the ignored classes.
print('Pos-Labels:', POSITIVE_LABELS)
print('Ign-Labels:', IGNORE_LABELS)

Pos-Labels: [20, 44, 452]
Ign-Labels: [130, 205, 187, 430]


In [26]:
def pad_or_truncate(x, audio_length):
    """Return ``x`` forced to exactly ``audio_length`` samples.

    Inputs longer than ``audio_length`` are cut to their first ``audio_length``
    samples; anything else is extended at the end with zeros
    (``np.zeros``, so the padded result is a new concatenated array).
    """
    missing = audio_length - len(x)
    if missing < 0:
        return x[0:audio_length]
    return np.concatenate((x, np.zeros(missing)), axis=0)
    
def get_all_strong_wavs(mode):
    """Find the wav file of every clip listed in ``strong_seg_df``.

    For each unique ``wav_id`` the file ``<wav_id>.wav`` is searched first in
    ``DATA_DIR/*<mode>*/`` and, unless that gives exactly one hit, one directory
    level deeper. Clips without exactly one match are left out.

    Args:
        mode: Substring that the split directory name must contain.

    Returns:
        List of the matched wav file paths. A found/total summary is printed.
    """
    wav_ids = strong_seg_df['wav_id'].unique()
    found = []
    for wav_id in tqdm(wav_ids, desc='Index file'):
        matches = glob.glob(f"{DATA_DIR}/*{mode}*/{wav_id}.wav")
        if len(matches) != 1:
            # Retry one level deeper when the flat layout gave no unique hit.
            matches = glob.glob(f"{DATA_DIR}/*{mode}*/*/{wav_id}.wav")
        if len(matches) == 1:
            found.append(matches[0])

    print("Found files:", len(found), '/', len(wav_ids))
    return found

def valid_boundary(end_thre, start_time, end_time):
    """Tell whether more than half of an annotation lies before ``end_thre``.

    Args:
        end_thre: Time threshold (the end of the current window).
        start_time: Annotation start.
        end_time: Annotation end.

    Returns:
        False when the annotation starts at/after ``end_thre`` or its duration
        is below 1e-8; otherwise whether
        ``(end_thre - start_time) / (end_time - start_time)`` exceeds 0.5.
    """
    if end_thre <= start_time:
        return False
    duration = end_time - start_time
    if duration < 1e-8:
        return False
    covered_fraction = (end_thre - start_time) / duration
    return covered_fraction > 0.5

def get_unique_classes(data, axis):
    """Return the sorted unique values found at position ``axis`` of each entry.

    Args:
        data: Iterable of indexable entries (e.g. tuples).
        axis: Index taken from every entry.

    Returns:
        A plain Python list of the distinct values, as produced by ``np.unique``.
    """
    picked = np.array([entry[axis] for entry in data])
    return np.unique(picked).tolist()

def convert_to_seg(audio_file):
    """Cut one clip into fixed-length, labelled segments and save them as wav files.

    The clip's strong annotations (``strong_seg_df``) are swept with
    consecutive windows of ``params['crop_second']`` seconds. Every window that
    has at least one active label is written, with
    ``params['croped_pad_second']`` of extra context, to
    ``SAVE_DIR/wav/PROCESS_MODE/<wav_id>/<wav_id>_<k>.wav``.

    Reads the module-level ``SAVE_DIR``, ``PROCESS_MODE``, ``params``,
    ``weak_seg_df``, ``strong_seg_df``, ``POSITIVE_LABELS`` and
    ``IGNORE_LABELS``.

    Args:
        audio_file: Path of the source wav; its stem is used as ``wav_id``.

    Returns:
        ``None`` when the clip is skipped (no weak or no strong rows, or no
        positive strong label while the weak labels contain an ignored class).
        Otherwise a DataFrame with columns ``wav_id``, ``seg_id``,
        ``start_time``, ``end_time`` (padded window, seconds) and ``labels``
        (list of label ids); it may be empty.
    """
    save_dir = f"{SAVE_DIR}/wav/{PROCESS_MODE}"
    os.makedirs(save_dir, exist_ok=True)

    wav_id = os.path.splitext(os.path.basename(audio_file))[0]
    weak_temp_data = weak_seg_df[weak_seg_df['wav_id'] == wav_id]
    strong_temp_data = strong_seg_df[strong_seg_df['wav_id'] == wav_id]
    if weak_temp_data.shape[0] == 0 or strong_temp_data.shape[0] == 0:
        return None

    # Only these two facts drive the filter below.
    weak_include_ign_flag = any(
        np.isin(labels, IGNORE_LABELS).any() for labels in weak_temp_data['label_id'])
    strong_include_pos_flag = strong_temp_data['label_id'].isin(POSITIVE_LABELS).any()

    # Skip clips that have no positive strong label while their weak labels
    # contain an ignored class.
    if (not strong_include_pos_flag) and weak_include_ign_flag:
        return None

    data_frame = {"wav_id": [], "seg_id": [], 'start_time': [],
                  'end_time': [], 'labels': []}

    sample_rate = params['sample_rate']
    crop_second = params['crop_second']
    pad_second = params['croped_pad_second']
    # Every saved segment is zero-padded up to window + padding on both sides.
    num_block_point_ = (crop_second + 2 * pad_second) * sample_rate

    (audio, _) = librosa.core.load(audio_file, sr=sample_rate, mono=True)

    # One output directory per clip, created once for all of its segments.
    save_wav_dir = os.path.join(save_dir, wav_id)
    os.makedirs(save_wav_dir, exist_ok=True)

    # PriorityQueue is not defined in this notebook (presumably it comes from
    # utils.util_process). Assumed: push(item, priority), and top()[0] is the
    # smallest pushed end time - TODO confirm against the helper.
    pq = PriorityQueue()
    wav_meta = strong_temp_data.sort_values(by='start_time_seconds')
    max_time = wav_meta['end_time_seconds'].max()
    meta_index, new_wav_id = 0, 0

    start_time_list = wav_meta['start_time_seconds'].tolist()
    end_time_list = wav_meta['end_time_seconds'].tolist()
    class_label_list = wav_meta['label_id'].tolist()

    # NOTE(review): annotations are admitted strictly in start-time order and
    # admission only looks at the window's end, so (a) an annotation that is
    # not yet accepted holds back all later-starting ones, and (b) an
    # annotation that already ended before the window starts can still be
    # labelled on the window in which it is admitted. Confirm this is intended.
    for start_time in np.arange(0, max_time, crop_second):
        end_time = start_time + crop_second

        # Drop annotations whose end time is not after the window start.
        while not pq.is_empty() and pq.top()[0] <= start_time:
            pq.pop()

        # Admit further annotations, in start-time order, while they are valid
        # for this window's end.
        while meta_index < len(start_time_list) and valid_boundary(
                end_time, start_time_list[meta_index], end_time_list[meta_index]):
            pq.push(class_label_list[meta_index], end_time_list[meta_index])
            meta_index += 1

        class_labels = get_unique_classes(pq._queue, axis=-1)
        if len(class_labels) == 0: continue

        # Widen the window by the padding; when there is no room before the
        # start, all of the padding goes to the end instead.
        if start_time - pad_second < 0:
            end_time = end_time + pad_second * 2
        else:
            start_time = start_time - pad_second
            end_time = end_time + pad_second

        start_sample_point = int(start_time * sample_rate)
        end_sample_point = int(end_time * sample_rate)
        sub_wav = audio[start_sample_point:end_sample_point]

        # Windows with less than half a second of real audio are discarded.
        num_smaple_point_ = sub_wav.shape[0]
        if num_smaple_point_ < sample_rate // 2: continue

        if num_smaple_point_ < num_block_point_:
            sub_wav = np.pad(sub_wav, (0, int(num_block_point_ - num_smaple_point_)))

        save_wav_file = os.path.join(save_wav_dir, f'{wav_id}_{new_wav_id}.wav')
        soundfile.write(save_wav_file, sub_wav, sample_rate)

        data_frame['wav_id'].append(wav_id)
        data_frame['seg_id'].append(f'{wav_id}_{new_wav_id}')
        data_frame['start_time'].append(start_time)
        data_frame['end_time'].append(end_time)
        data_frame['labels'].append(class_labels)
        new_wav_id += 1

    return pd.DataFrame(data_frame)

def select_middle_portion(arr, desired_length):
    """Return the ``desired_length`` rows around the middle of a 2-D array.

    The slice starts at ``len(arr) // 2 - desired_length // 2`` along the first
    axis and keeps every column.

    Raises:
        ValueError: If ``arr`` has fewer than ``desired_length`` rows.
    """
    total = len(arr)
    if total < desired_length:
        raise ValueError("Array length is less than the desired length")

    first = total // 2 - desired_length // 2
    return arr[first:first + desired_length, :]

def convert_to_hdf5(save_hdf5_file, meta_data):
    """Compute a log-mel patch for every segment and pack them into one HDF5 file.

    Segment audio is read from ``SAVE_DIR/wav/PROCESS_MODE/<wav_id>/<seg_id>.wav``.
    The file gets three datasets with one entry per row of ``meta_data``
    (``audio_name``: S20, ``logmel``: float32 of shape ``params['input_size']``,
    ``target``: variable-length int32) and a ``sample_rate`` attribute.

    Args:
        save_hdf5_file: Output path; its parent directory is created if needed.
        meta_data: DataFrame with ``wav_id``, ``seg_id`` and ``labels`` columns.
            ``labels`` may be a sequence of ints or its literal text form
            (e.g. ``"[74, 114, 254]"`` after a CSV round trip).

    Rows are stored by their position in ``meta_data``, independent of the
    DataFrame index. A row whose wav file is missing is reported with a
    warning and keeps the datasets' initial contents.
    """
    save_dir = f"{SAVE_DIR}/wav/{PROCESS_MODE}"
    parent_dir = os.path.dirname(save_hdf5_file)
    if parent_dir:  # a bare file name has no directory to create
        os.makedirs(parent_dir, exist_ok=True)

    audios_num = meta_data.shape[0]
    with h5py.File(save_hdf5_file, 'w') as hf:
        # NOTE(review): file names longer than 20 bytes do not fit 'S20' - confirm
        # that segment names stay within that limit.
        hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20')
        hf.create_dataset('logmel', shape=((audios_num, *params['input_size'])), dtype=np.float32)
        hf.create_dataset('target', shape=((audios_num, )), dtype=
                                h5py.special_dtype(vlen=np.dtype('int32')))
        hf.attrs.create('sample_rate', data=params['sample_rate'], dtype=np.int32)

        # enumerate() gives the storage position; the DataFrame index label is
        # ignored so filtered / re-indexed frames are written correctly.
        rows = tqdm(meta_data.iterrows(), total=audios_num, desc='Convert to HDF5')
        for n, (_, meta_item) in enumerate(rows):
            wav_id = meta_item['wav_id']
            seg_id = meta_item['seg_id']
            target = meta_item['labels']
            if isinstance(target, str):
                # Labels read back from CSV are text; parse them as a literal
                # only (no arbitrary code execution).
                target = ast.literal_eval(target)

            audio_path = os.path.join(save_dir, wav_id, f'{seg_id}.wav')
            if os.path.isfile(audio_path):
                (audio, _) = librosa.core.load(audio_path, sr=params['sample_rate'], mono=True)

                # spectrogram / logmel_spectrogram are not defined in this
                # notebook (presumably from utils.util_process).
                melspec = spectrogram(data=audio,
                                n_fft=params['n_fft'],
                                hop_length=params['hop_length'],
                                win_length=params['win_length'],
                                window='hann',
                                center=True,
                                pad_mode='reflect')
                logmel = logmel_spectrogram(data=melspec,
                                            sr=params['sample_rate'],
                                            n_fft=params['n_fft'],
                                            n_mels=params['mel_bins'],
                                            fmin=params['lower_hertz'],
                                            fmax=params['upper_hertz'])
                # Keep the central input_size[0] rows so every patch has the
                # stored shape (raises ValueError if the segment is too short).
                logmel = select_middle_portion(logmel, params['input_size'][0])
                audio_name = os.path.basename(audio_path)
                hf['audio_name'][n] = audio_name
                hf['logmel'][n] = logmel
                hf['target'][n] = np.array(target, dtype=np.int32)
            else:
                # WARNING so the message is visible with the default logging level.
                logging.warning('{} File does not exist! {}'.format(n, audio_path))

def convert_to_indexes(save_indexes_file, saved_data_file):
    """Write an index HDF5 file that points at the rows of a packed data file.

    Copies ``audio_name`` and ``target`` from ``saved_data_file`` and adds, for
    each row, the data file's path (``hdf5_path``, S200) and the row number
    (``index_in_hdf5``, int32). The row count is taken from the ``logmel``
    dataset.

    Args:
        save_indexes_file: Path of the index file to create (overwritten).
        saved_data_file: Path of the existing data file to index.
    """
    vlen_int32 = h5py.special_dtype(vlen=np.dtype('int32'))
    with h5py.File(saved_data_file, 'r') as data_h5:
        n_items = data_h5['logmel'].shape[0]
        with h5py.File(save_indexes_file, 'w') as index_h5:
            index_h5.create_dataset('audio_name', data=data_h5['audio_name'][:], dtype='S20')
            index_h5.create_dataset('target', data=data_h5['target'][:], dtype=vlen_int32)
            # Same encoded path repeated once per row.
            path_column = [(saved_data_file).encode()] * n_items
            index_h5.create_dataset('hdf5_path', data=path_column, dtype='S200')
            index_h5.create_dataset('index_in_hdf5', data=np.arange(n_items), dtype=np.int32)


In [27]:
# Locate the wav file on disk for every strongly-labelled clip of the chosen split.
wav_files = get_all_strong_wavs(PROCESS_MODE)

Index file: 100%|██████████| 16996/16996 [00:01<00:00, 13549.60it/s]

Found files: 16904 / 16996





In [28]:
# Segment every clip in parallel and collect the per-clip tables into one CSV.
save_csv_file = f"{SAVE_DIR}/wav/{PROCESS_MODE}.csv"
with Pool(16) as pool:
    result_dataframes = pool.map(convert_to_seg, wav_files)

# convert_to_seg returns None for skipped clips and may return an empty table;
# neither contributes rows, so both are dropped before concatenating.
segment_frames = [df for df in result_dataframes if df is not None and not df.empty]
if not segment_frames:
    raise ValueError(f"No segments were produced for PROCESS_MODE={PROCESS_MODE!r}")
final_dataframe = pd.concat(segment_frames, ignore_index=True)
os.makedirs(os.path.dirname(save_csv_file), exist_ok=True)
final_dataframe.to_csv(save_csv_file, index=False)
print(final_dataframe.shape)
final_dataframe.head()

(61426, 5)


Unnamed: 0,wav_id,seg_id,start_time,end_time,labels
0,Ys9d-2nhuJCQ,Ys9d-2nhuJCQ_0,3.95,6.05,"[74, 114, 254]"
1,Ys9d-2nhuJCQ,Ys9d-2nhuJCQ_1,5.95,8.05,"[74, 93, 114, 254, 435]"
2,Ys9d-2nhuJCQ,Ys9d-2nhuJCQ_2,7.95,10.05,"[93, 114, 254, 435]"
3,YYxlGt805lTA,YYxlGt805lTA_0,3.95,6.05,"[157, 212, 254, 368]"
4,YYxlGt805lTA,YYxlGt805lTA_1,5.95,8.05,"[10, 254]"


In [29]:
# Reload the segment table from disk so later cells can start from the CSV.
# After the round trip the 'labels' column holds text such as "[74, 114, 254]".
final_dataframe = pd.read_csv(save_csv_file)
print(final_dataframe.shape)
final_dataframe.head()

(61426, 5)


Unnamed: 0,wav_id,seg_id,start_time,end_time,labels
0,Ys9d-2nhuJCQ,Ys9d-2nhuJCQ_0,3.95,6.05,"[74, 114, 254]"
1,Ys9d-2nhuJCQ,Ys9d-2nhuJCQ_1,5.95,8.05,"[74, 93, 114, 254, 435]"
2,Ys9d-2nhuJCQ,Ys9d-2nhuJCQ_2,7.95,10.05,"[93, 114, 254, 435]"
3,YYxlGt805lTA,YYxlGt805lTA_0,3.95,6.05,"[157, 212, 254, 368]"
4,YYxlGt805lTA,YYxlGt805lTA_1,5.95,8.05,"[10, 254]"


In [30]:
# Pack the log-mel features and targets of all segments into one HDF5 file.
save_data_file = f"{SAVE_DIR}/logmel/{PROCESS_MODE}_data.h5"
convert_to_hdf5(save_data_file, final_dataframe)

Convert to HDF5: 61426it [05:44, 178.07it/s]


In [31]:
# Write the companion index file that references each row of the data file.
save_index_file = f"{SAVE_DIR}/logmel/{PROCESS_MODE}_index.h5"
convert_to_indexes(save_index_file, save_data_file)