In [1]:
import pandas as pd
import numpy as np
import glob, os, librosa, soundfile, json

import librosa.display
from multiprocessing import Pool

from utils.util_process_ import *

import matplotlib.pyplot as plt
from IPython.display import Audio

In [2]:
# Paths to the strong-label annotation tables read by the loading cells below.
params = dict(
    train_strong='./metadata/strong/audioset_train_strong.tsv',
    eval_strong='./metadata/strong/audioset_eval_strong.tsv',
    label_tsv='./metadata/strong/mid_to_display_name.tsv',
)

In [3]:
# Load the label-id / display-name table. The file has no header row, so the
# two positional columns are named explicitly (without mutating in place).
label_data = (
    pd.read_csv(params['label_tsv'], delimiter='\t', header=None)
    .rename(columns={0: 'label', 1: 'class'})
)
# Replace spaces in class names with '-'. `regex=False` makes this a plain
# literal substitution, matching str.replace semantics.
label_data['class'] = label_data['class'].str.replace(' ', '-', regex=False)
print(label_data.shape)
label_data.head(1)

(456, 2)


Unnamed: 0,label,class
0,/g/11b630rrvh,Kettle-whistle


In [4]:
# Distinct class names in sorted order; a name's position in this array is
# used as its integer label.
classes = np.sort(label_data['class'].unique())
class2label = dict(zip(classes, range(len(classes))))

In [5]:
# Read the strong-label training annotations and attach each row's class name
# by joining on the label id.
train_data = (
    pd.read_csv(params['train_strong'], delimiter='\t')
    .merge(label_data, on='label')
)
# Keep only the part of segment_id before its last '_' (drops the trailing token).
train_data['segment_id'] = train_data['segment_id'].map(
    lambda segment: segment.rpartition('_')[0]
)
print(train_data.shape)
train_data.head(3)

(934821, 5)


Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,class
0,b0RFKhbpFJA,0.0,10.0,/m/03m9d0z,Wind
1,ZvAdd4Jb1xA,0.0,10.0,/m/03m9d0z,Wind
2,sWdJR9dInhw,0.0,9.803,/m/03m9d0z,Wind


In [6]:
# --- Segmentation -----------------------------------------------------------
SPLIT_TIME_LENGTH = 1.0  # length of each cut window, in seconds

# --- Run mode / shared state -------------------------------------------------
EVAL_MODE = False        # selects the 'data_e_v2' vs 'data_v2' output folder in split_segment_data
global_metadata = None   # annotation table read by split_segment_data; assigned before the pool runs

# --- Locations ---------------------------------------------------------------
DATA_DIR = './data/audioset/audios/eval_segments'
SAVE_DIR = '/media/ubuntu/HD/Data/Audioset-Seg'

# --- Optional outputs --------------------------------------------------------
SAVE_LOGMEL_FLAG = False  # also save one image for the whole clip
SAVE_WAV_FLAG = False     # also save each cut window as a .wav file

# --- Audio / spectrogram settings --------------------------------------------
# NOTE: this rebinds `params`, replacing the metadata-path dict assigned in an
# earlier cell. Re-run that cell before re-running the cells that read the
# .tsv files.
params = dict(
    resampling_rate=32000,
    n_fft=1024,
    hop_length=320,
    win_length=1024,
    lower_hertz=50,
    upper_hertz=14000,
    mel_bins=64,
)

In [7]:
import heapq

class PriorityQueue:
    """Min-priority queue built on `heapq`.

    Each heap entry is the tuple ``(priority, insertion_index, item)``. The
    insertion index breaks ties between equal priorities in insertion order,
    so the items themselves are never compared.
    """

    def __init__(self):
        self._queue = []   # heap of (priority, insertion_index, item)
        self._index = 0    # running insertion counter

    def push(self, item, priority):
        """Insert `item` with the given `priority` (smaller pops first)."""
        entry = (priority, self._index, item)
        self._index += 1
        heapq.heappush(self._queue, entry)

    def pop(self):
        """Remove the lowest-priority entry and return only its item."""
        _priority, _order, item = heapq.heappop(self._queue)
        return item

    def top(self):
        """Return the lowest-priority entry as the full tuple, without removing it."""
        return self._queue[0]

    def is_empty(self):
        """Return True when the queue holds no entries."""
        return not self._queue

# Create a priority queue.
pq = PriorityQueue()

# Add items to the queue, each with its priority.
for _name, _priority in [
    ("music", 0.032),
    ("dog", 1.023),
    ("speak", 0.0123),
    ("dog", 0.832),
    ("bark", 0.6502),
    ("bark", 0.1202),
]:
    pq.push(_name, _priority)

# Drain the queue: show the head entry, then the item that pop() returns.
while not pq.is_empty():
    print(pq.top(), end=', ')
    item = pq.pop()
    print(item)

(0.0123, 2, 'speak'), speak
(0.032, 0, 'music'), music
(0.1202, 5, 'bark'), bark
(0.6502, 4, 'bark'), bark
(0.832, 3, 'dog'), dog
(1.023, 1, 'dog'), dog


In [15]:
def valid_boundary(end_thre, start_time, end_time, min_ratio=0.3, min_overlap=0.3):
    """Decide whether enough of an event lies before a window's end.

    The event is accepted when the span from its start up to `end_thre`
    is either more than `min_ratio` of the event's full duration, or longer
    than `min_overlap` in absolute terms.

    Args:
        end_thre: End time of the window being filled.
        start_time: Start time of the event.
        end_time: End time of the event.
        min_ratio: Minimum fraction of the event's duration that must lie
            before `end_thre`. Defaults to 0.3.
        min_overlap: Minimum absolute span before `end_thre`, in the same
            time unit as the other arguments (the caller in this file passes
            seconds). Defaults to 0.3.

    Returns:
        bool: True when the event should be admitted, False otherwise.
    """
    elapsed = end_thre - start_time
    if elapsed <= 0:
        # The event has not started before the window ends.
        return False

    duration = end_time - start_time
    if duration == 0:
        # Zero-length event that starts before `end_thre`: it lies entirely
        # inside the span, and the ratio below would divide by zero.
        return True

    return bool((elapsed / duration) > min_ratio or elapsed > min_overlap)

def get_unique_classes(data, axis):
    """Return the integer labels of the distinct class names found in `data`.

    Args:
        data: Iterable of indexable entries (the caller passes the heap
            entries of a PriorityQueue).
        axis: Index, within each entry, of the class name.

    Returns:
        list: `class2label[name]` for each distinct name, in the sorted order
        produced by `np.unique`. Empty when `data` is empty.
    """
    picked_names = np.array([entry[axis] for entry in data])
    labels = []
    for class_name in np.unique(picked_names).tolist():
        labels.append(class2label[class_name])
    return labels

def _compute_logmel(wav):
    """Run `wav` through the project's `spectrogram` and `logmel_spectrogram` helpers using `params`."""
    spec = spectrogram(data=wav,
                       n_fft=params['n_fft'],
                       hop_length=params['hop_length'],
                       win_length=params['win_length'],
                       window='hann',
                       center=True,
                       pad_mode='reflect')
    return logmel_spectrogram(data=spec,
                              sr=params['resampling_rate'],
                              n_fft=params['n_fft'],
                              n_mels=params['mel_bins'],
                              fmin=params['lower_hertz'],
                              fmax=params['upper_hertz'])


def split_segment_data(wav_file):
    """Cut one audio clip into fixed-length windows and save an image per labelled window.

    The clip's annotation rows are taken from the module-level
    `global_metadata`. The clip is walked in steps of `SPLIT_TIME_LENGTH`;
    for every window a priority queue keyed by event end time holds the events
    considered active. Windows with at least one active class are written as
    `.tiff` files (and as `.wav` when `SAVE_WAV_FLAG` is set) under
    `SAVE_DIR/<data_v2|data_e_v2>/<part>/<wav_id>/`.

    Args:
        wav_file: Path to the audio file. The segment id is the file stem
            without its first character; the `<part>` folder is the last
            '_'-separated token of the parent directory name.

    Returns:
        pandas.DataFrame with columns `segments`, `wav_id`, `start_time`,
        `end_time`, `classes` (one row per saved window), or None when
        `global_metadata` has no rows for this clip. If processing raises,
        the error is printed and the rows collected so far are returned.
    """
    global global_metadata
    global EVAL_MODE

    data_frame = {"segments": [], "wav_id": [], 'start_time': [],
                  'end_time': [], 'classes': []}

    save_dir = os.path.join(SAVE_DIR, 'data_e_v2' if EVAL_MODE else 'data_v2')
    os.makedirs(save_dir, exist_ok=True)

    # Drop the first character of the file stem to obtain the segment id.
    wav_id = os.path.splitext(os.path.basename(wav_file))[0][1:]

    # Look the clip up once, before decoding any audio, instead of computing
    # the table's unique ids for every file and filtering a second time.
    wav_meta = global_metadata[global_metadata['segment_id'] == wav_id]
    if wav_meta.shape[0] == 0: return

    try:
        sample_rate = params['resampling_rate']
        # Ask librosa for the target rate directly: without `sr=` it first
        # resamples to its 22050 Hz default, which discards content above
        # ~11 kHz before the second resample. `mono=True` guarantees a 1-D
        # signal, which the slicing and padding below rely on (stereo input
        # with mono=False would be shaped (channels, samples)).
        wav_data, _ = librosa.load(wav_file, sr=sample_rate, mono=True)

        part_dir = wav_file.split('/')[-2].split('_')[-1]
        out_dir = os.path.join(save_dir, part_dir, wav_id)
        os.makedirs(out_dir, exist_ok=True)

        if SAVE_LOGMEL_FLAG:
            # Whole-clip image, saved next to the per-window ones.
            save_tiff(os.path.join(out_dir, f'{wav_id}.tiff'), _compute_logmel(wav_data))

        pq = PriorityQueue()
        wav_meta = wav_meta.sort_values(by='start_time_seconds')
        max_time = wav_meta['end_time_seconds'].max()
        meta_index, new_wav_id = 0, 0

        start_time_list = wav_meta['start_time_seconds'].tolist()
        end_time_list = wav_meta['end_time_seconds'].tolist()
        class_name_list = wav_meta['class'].tolist()
        num_events = len(start_time_list)
        num_block_point = SPLIT_TIME_LENGTH * sample_rate

        for start_time in np.arange(0, max_time, SPLIT_TIME_LENGTH):
            end_time = start_time + SPLIT_TIME_LENGTH

            # Retire events that ended at or before this window's start.
            while not pq.is_empty() and pq.top()[0] <= start_time:
                pq.pop()

            # Admit events in start-time order until one is not yet valid for
            # this window; that one is retried on the next window.
            while meta_index < num_events:
                event_start = start_time_list[meta_index]
                event_end = end_time_list[meta_index]
                if event_end <= start_time:
                    # The event was held back behind an earlier one and is
                    # already over; pushing it would label this window with a
                    # class that is no longer active.
                    meta_index += 1
                    continue
                if not valid_boundary(end_time, event_start, event_end):
                    break
                pq.push(class_name_list[meta_index], event_end)
                meta_index += 1

            class_name = get_unique_classes(pq._queue, axis=-1)
            if len(class_name) == 0: continue

            start_sample_point = int(start_time * sample_rate)
            end_sample_point = int(end_time * sample_rate)
            sub_wav = wav_data[start_sample_point:end_sample_point]

            # Zero-pad a short final window up to the full block length.
            missing = int(num_block_point - sub_wav.shape[0])
            if missing > 0:
                sub_wav = np.pad(sub_wav, (0, missing))

            segment_name = f'{wav_id}_{new_wav_id}'
            if SAVE_WAV_FLAG:
                soundfile.write(os.path.join(out_dir, f'{segment_name}.wav'),
                                sub_wav, sample_rate)

            save_tiff(os.path.join(out_dir, f'{segment_name}.tiff'),
                      _compute_logmel(sub_wav))

            data_frame['segments'].append(part_dir)
            data_frame['wav_id'].append(segment_name)
            data_frame['start_time'].append(start_time)
            data_frame['end_time'].append(end_time)
            data_frame['classes'].append(class_name)
            new_wav_id += 1
    except Exception as e:
        # Best-effort: report the failing file and keep whatever was collected.
        print(wav_file, e)

    return pd.DataFrame(data_frame)

In [16]:
# split_segment_data reads its annotations from the module-level
# `global_metadata`, so it must be assigned before the pool is created.
# NOTE(review): worker visibility of this global presumably relies on the
# 'fork' start method — confirm on other platforms.
global_metadata = train_data.copy()
wav_dirs = glob.glob('./data/audioset/audios/unbalanced_train_segments/*')

# Process (at most) the first 1024 files of one part folder with 4 workers.
wav_files = glob.glob('./data/audioset/audios/unbalanced_train_segments/unbalanced_train_segments_part15/*')
_batch = wav_files[:1024]
with Pool(processes=4) as pool:
    result_dataframes = pool.map(split_segment_data, _batch)

In [17]:
# split_segment_data returns None for clips without annotations and may return
# an empty frame when nothing was saved. Drop both up front: pd.concat raises
# if every entry is None, and empty frames should not influence the result's
# column dtypes.
_frames = [frame for frame in result_dataframes
           if frame is not None and len(frame) > 0]
if _frames:
    final_dataframe = pd.concat(_frames, ignore_index=True)
else:
    # Nothing was produced; keep the expected columns so later cells still run.
    final_dataframe = pd.DataFrame(
        columns=['segments', 'wav_id', 'start_time', 'end_time', 'classes'])
print(final_dataframe.shape)
final_dataframe.head()

(415, 5)


Unnamed: 0,segments,wav_id,start_time,end_time,classes
0,part15,Gl00nDatuL0_0,0.0,1.0,[243]
1,part15,Gl00nDatuL0_1,1.0,2.0,"[239, 243]"
2,part15,Gl00nDatuL0_2,2.0,3.0,"[239, 243]"
3,part15,Gl00nDatuL0_3,3.0,4.0,"[239, 243]"
4,part15,Gl00nDatuL0_4,4.0,5.0,[243]


In [18]:
# Both output files live in SAVE_DIR/metadata.
metadata_dir = os.path.join(SAVE_DIR, 'metadata')
os.makedirs(metadata_dir, exist_ok=True)

# Per-window table produced by the segmentation run.
metadata_file = os.path.join(metadata_dir, 'train_strong_v2.csv')
final_dataframe.to_csv(metadata_file, index=False)

# Class-name -> integer-label mapping used for the `classes` column.
class2label_file = os.path.join(metadata_dir, 'class2label.json')
with open(class2label_file, 'w') as f:
    json.dump(class2label, f)

In [12]:
# Number of distinct source clips: strip the trailing '_<index>' from each
# window id and count the distinct remainders.
final_dataframe['wav_id'].map(lambda window_id: window_id.rpartition('_')[0]).nunique()

45

In [13]:
# global_metadata = train_data.copy()
# wav_dirs = glob.glob('./data/audioset/audios/unbalanced_train_segments/*')

# wav_files = glob.glob(os.path.join(wav_dirs[2], '*'))
# data = split_segment_data(wav_files[0])

In [14]:
# data.head(10)