In [17]:
import os
import json
import numpy as np
import tensorflow as tf
import soundfile as sf
import resampy
import sys
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import argparse
from tensorflow.keras.utils import Sequence
import time

sys.path.append('/home/ysr/project/ai/deep_learning')

from random import shuffle
#import features.audio_features as features_lib
import feature.audio_features as features_lib
from utils.params import Params
import utils.tools as tools
#import utils.params as ynet_params



In [30]:
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
SILENCE_LABEL = '_Silence_'
UNKNOWN_WORD_LABEL = '_Unknown_'

#设置行不限制数量
pd.set_option('display.max_rows',None)
#最后的的参数可以限制输出行的数量

#设置列不限制数量
pd.set_option('display.max_columns',None)
#最后的的参数可以限制输出列的数量

#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',1000)

def log_mel_spectrogram(audio_file, param):
    wav_data, sr = sf.read(audio_file, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # 补0 to do  mask
    if len(wav_data) < param.sample_rate:
        wav_data = np.pad(wav_data, (0, int(param.sample_rate - len(wav_data))), 'constant', constant_values = 0)

    waveform = wav_data / 32768.0

    if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)  # 多通道转单通道

    if sr != param.sample_rate:
            waveform = resampy.resample(waveform, sr, param.sample_rate)

    waveform = np.reshape(waveform, [1, -1]).astype(np.float32)
    return features_lib.waveform_to_log_mel_spectrogram_patches(tf.squeeze(waveform, axis=0), param)

def get_files_and_labels(train_dir, file_type = 'wav', train_split = 0.9, wanted_label = None):
    #ignored = {"folder_one", "folder_two", "folder_three"}
    #folders = [x for x in os.listdir(path) if x not in ignored]
    
    if not wanted_label:
        classes = sorted(os.listdir(train_dir))
    else :
        classes = [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_label.split(',') 
    files_train = list()
    files_val = list()
    labels = dict()
 
    for cnt, i in enumerate(classes): # loop over classes
        tmp = os.listdir(train_dir + i)
        shuffle(tmp)
        for j in tmp[:round(len(tmp)*train_split)]: # loop over training samples
            if j.split('.')[-1] == file_type:
                files_train.append(train_dir + i +'/' + j)
        for j in tmp[round(len(tmp)*train_split):]: # loop over validation samples
            if j.split('.')[-1] == file_type:
                files_val.append(train_dir + i +'/' + j)
        labels[i] = cnt
    return files_train, files_val, labels


def _parse_audio_function(example_string):
    n_classes = 527
    feature = { 
        'patches': tf.io.FixedLenFeature([], tf.string),
        'patches_shape': tf.io.FixedLenFeature(shape=(3,), dtype=tf.int64), # shape = 3 
        'label': tf.io.FixedLenFeature([n_classes], dtype=tf.int64), 
    }       
    feature_dict = tf.io.parse_single_example(example_string, feature)
    patches_raw = feature_dict['patches']   
    patches_shape = feature_dict['patches_shape']
    label = feature_dict['label']

    patches = tf.io.decode_raw(patches_raw, tf.float32)
    patches = tf.reshape(patches, patches_shape)
    label = tf.reshape(label, (1, n_classes)) 
    return patches, label


def audio_example(patches, label): # tf record example
    feature = { 
        'patches': tools._numpy_float32_feature(patches),
        'patches_shape': tools._shape_feature(patches.shape),
        'label': tools._int64_list_feature(label),
    }   
    return tf.train.Example(features=tf.train.Features(feature=feature)) 

def audio_file_example(patches): # tf record example
    feature = { 
        'patches': tools._bytes_feature(patches),
    }   
    return tf.train.Example(features=tf.train.Features(feature=feature)) 

class ESC50DataSet():
    def __init__(self):
        print('')

    def build_dataset():
        print('')

class AudioSetDataSet():
    def __init__(self, params, path, cache_dir = None):
        self.params = params
        self.path = path
        self.cache_dir = cache_dir

    def __read_pd(self, path):
        valid_meta = pd.read_csv(path + 'valid.csv')
        valid_id = valid_meta.groupby('YTID')['YTID'].apply(lambda cat: cat.sample(1)).reset_index()['YTID']
        train_meta = pd.read_csv(path+ 'train.csv')
        train_id = train_meta.groupby('YTID')['YTID'].apply(lambda cat: cat.sample(1)).reset_index()['YTID']
        class_meta = pd.read_csv(path + 'class_labels_indices.csv')
        class_id = class_meta.groupby('index')['index'].apply(lambda cat: cat.sample(1)).reset_index()['index']
        return valid_meta, valid_id, train_meta, train_id, class_meta, class_id
        
    def __build_classes(self, class_meta, class_id):
        labels = []
        label_index = {}
        for index in range(len(class_id)):
            item = []
            clas = class_id[index]
            clas = class_meta[class_meta.index == clas]
            label_index[clas.mid.to_string(index=False)] = clas.index[0]
            item.append(clas.index[0])
            labels.append(item)

        mlb = MultiLabelBinarizer()
        mlb.fit(labels)
        print(labels)
        return label_index, mlb, mlb.classes_
    

                    
    def __build_cache_np(self, wav_dir, npy_dir, cache_path, file_ytid, meta,  mlb, label_index, params):
        total_size = len(file_ytid)
        with tf.io.TFRecordWriter(cache_path) as writer:
            for index in range(total_size):
                label_id = []
                item = file_ytid[index]
                item = meta[meta.YTID == item]
                
                multi_label = item.positive_labels.to_string(index=False).split(",")
                for label in multi_label:
                    label_id.append(label_index[label])

                wav_path = wav_dir + item.YTID.to_string(index=False) + '.wav'
                patches_npy_path = npy_dir + 'patches/' + item.YTID.to_string(index=False) + '.npy'
                label_npy_path = npy_dir + 'label/' + item.YTID.to_string(index=False) + '.npy'
                
                spectrogram, patches = log_mel_spectrogram(wav_path, params)
                np.save(patches_npy_path, patches)
                np.save(label_npy_path, mlb.transform([label_id])[0])
                
                d = f"Scanning '{wav_path}' audio and labels... {total_size} found, {index} corrupted"
                tqdm(None, desc=d, total=total_size, initial=index)  # display cache results
                if index == 10:
                    break

            writer.close()

    def build_dataset(self):
        valid_meta, valid_id, train_meta, train_id, class_meta, class_id = self.__read_pd(self.path)
        label_index, mlb, classes = self.__build_classes(class_meta, class_id)
        self.__build_cache_np(self.path + 'train_wav/', self.path + 'train_np/', self.path + 'train.cache', train_id, train_meta, mlb, 
                                    label_index, self.params)
        self.__build_cache_np(self.path + 'valid_wav/', self.path + 'valid_np/', self.path + 'valid.cache', valid_id, valid_meta, mlb, 
                                    label_index, self.params)


In [31]:
param = Params()
dataset = AudioSetDataSet(param, '/home/ysr/dataset/audio/audioset/', './')
dataset.build_dataset()

[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95], [96], [97], [98], [99], [100], [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], [141], [142], [143], [144], [145], [146], [147], [148], [149], [150], [151], [152], [153], [154], [155], [156], [157], [15

Scanning '/home/ysr/dataset/audio/audioset/train_wav/--PJHxphWEs.wav' audio and labels... 19644 found, 0 corrupted:   0%|          | 0/19644 [00:00<?, ?it/s]
Scanning '/home/ysr/dataset/audio/audioset/train_wav/--aE2O5G5WE.wav' audio and labels... 19644 found, 1 corrupted:   0%|          | 1/19644 [00:00<?, ?it/s]
Scanning '/home/ysr/dataset/audio/audioset/train_wav/--aaILOrkII.wav' audio and labels... 19644 found, 2 corrupted:   0%|          | 2/19644 [00:00<?, ?it/s]
Scanning '/home/ysr/dataset/audio/audioset/train_wav/--cB2ZVjpnA.wav' audio and labels... 19644 found, 3 corrupted:   0%|          | 3/19644 [00:00<?, ?it/s]
Scanning '/home/ysr/dataset/audio/audioset/train_wav/--ekDLDTUXA.wav' audio and labels... 19644 found, 4 corrupted:   0%|          | 4/19644 [00:00<?, ?it/s]
Scanning '/home/ysr/dataset/audio/audioset/train_wav/-0DLPzsiXXE.wav' audio and labels... 19644 found, 5 corrupted:   0%|          | 5/19644 [00:00<?, ?it/s]
Scanning '/home/ysr/dataset/audio/audioset/train_wav

In [45]:
def get_files_and_labels(train_dir, typ='npy', train_split=0.9):  
    files_train = list()
    files_val = list()
    label_train = list()
    label_val = list()
    tmp = os.listdir(train_dir + 'label/')
    shuffle(tmp)
    
    for j in tmp[:round(len(tmp)*train_split)]: # loop over training samples
        if j.split('.')[-1] == typ:
            files_train.append(train_dir  +'patches/' + j)
            label_train.append(train_dir  +'label/' + j)
    for j in tmp[round(len(tmp)*train_split):]: # loop over validation samples
        if j.split('.')[-1] == typ:
            files_val.append(train_dir +'patches/' + j)
            label_val.append(train_dir  +'label/' + j)
            
    return files_train, files_val, label_train, label_val

In [42]:
get_files_and_labels('/home/ysr/dataset/audio/audioset/train_np/')

['/home/ysr/dataset/audio/audioset/train_np/patches/--cB2ZVjpnA.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/-0SdAVK79lg.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/--aaILOrkII.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/-11LhdJgBb8.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/--aE2O5G5WE.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/-0mjrMposBM.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/-0DdlOuIFUI.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/-0O3e95y4gE.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/-0DLPzsiXXE.npy', '/home/ysr/dataset/audio/audioset/train_np/patches/--ekDLDTUXA.npy']
['/home/ysr/dataset/audio/audioset/train_np/patches/--PJHxphWEs.npy']
['/home/ysr/dataset/audio/audioset/train_np/label/--cB2ZVjpnA.npy', '/home/ysr/dataset/audio/audioset/train_np/label/-0SdAVK79lg.npy', '/home/ysr/dataset/audio/audioset/train_np/label/--aaILOrkII.npy', '/home/ysr/dataset/audio/audioset/tra

In [34]:
tmp = os.listdir('/home/ysr/dataset/audio/audioset/train_np/labels/')

['-0DLPzsiXXE.npy', '--ekDLDTUXA.npy', '--aE2O5G5WE.npy', '--PJHxphWEs.npy', '-11LhdJgBb8.npy', '-0O3e95y4gE.npy', '--cB2ZVjpnA.npy', '-0SdAVK79lg.npy', '-0mjrMposBM.npy', '-0DdlOuIFUI.npy', '--aaILOrkII.npy']


In [54]:
class DataGenerator(Sequence):
    
    'Generates YAMNet patches'
    def __init__(self, 
                 file_list,
                 label_list,
                 dim = (96, 64),
                 batch_size = 1, 
                 n_classes = 3,
                 shuffle = True):
        
        'Initialization'
        self.batch_size = batch_size
        self.dim = dim
        self.file_list = file_list
        self.label_list = label_list
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()
        

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.label_list) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        label_list_temp = [self.label_list[k] for k in indexes]
        file_list_temp = [self.file_list[k] for k in indexes]
        
        x, y = self.__data_generation(file_list_temp, label_list_temp)
        return x, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.label_list))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, file_list_temp, label_list_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization   
        return np.load(file_list_temp[0]), np.load(label_list_temp[0])
  

In [56]:
files_train, files_val, label_train, label_val = get_files_and_labels('/home/ysr/dataset/audio/audioset/train_np/')
datagen = DataGenerator(files_train, label_train)

datagen.__getitem__(0)
datagen.__len__()

10

In [None]:
      #X = np.empty((self.batch_size, *self.dim))
        X = []
        y = np.empty((self.batch_size, self.n_classes))
        z = np.empty((self.batch_size, 521))
        sample_weights = np.empty((self.batch_size, ))
        
        y[:] = 0
        #print(list_IDs_temp)
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            #print(ID)            
            class_id = ID.split('/')[-2]
            #print(class_id)
            y[i,self.labels[class_id]] = 1
            #print(y)          
            sample = np.load(ID)
            
            # if the waveform for this sample was long enough to contain multiple patches, randomly select one of the patches
            #if sample.shape[0] > 1:
                #sample = np.squeeze(sample[0])
             #   sample = np.squeeze(sample[np.random.choice(range(sample.shape[0]), 1)])
                
            X.append(sample)
            #X[i,] = sample
                
            if self.class_weights:
                sample_weights[i] = self.class_weights[self.labels[class_id]]
          
        self.classes.append(y.reshape(y.shape[1]))
        
        if self.class_weights is not None:
            return X, y, sample_weights
        else:
            if self.tflite_ouput == 1:
                return X, y
            else:
                return X, [y,z]