In [1]:
import h5py, glob, tifffile, os, random
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# Both locations live under the same local Audioset-Seg root, so spell the
# root once and derive the two paths from it.
_AUDIOSET_SEG_ROOT = '/media/ubuntu/HD/Data/Audioset-Seg'

# Data directory (cut log-mel data, going by the directory name).
DATA_DIR = _AUDIOSET_SEG_ROOT + '/data_cut_logmel'
# Metadata CSV that the following cells load and annotate with folds.
META_FILE = _AUDIOSET_SEG_ROOT + '/metadata/train_strong_2s.csv'

In [3]:
# Load the segment-level metadata table from the CSV at META_FILE.
metadata = pd.read_csv(filepath_or_buffer=META_FILE)

# Print (rows, columns), then render the first five rows as the cell output.
print(metadata.shape)
metadata.head(n=5)

(470086, 5)


Unnamed: 0,segments,wav_id,start_time,end_time,classes
0,part32,jj-39Vkt8jo_0,0.0,2.0,"[78, 103, 243, 316]"
1,part32,jj-39Vkt8jo_1,2.0,4.0,"[78, 103, 178, 212, 243, 337, 377]"
2,part32,jj-39Vkt8jo_2,4.0,6.0,"[178, 212, 243, 337, 377]"
3,part32,jj-39Vkt8jo_3,6.0,8.0,"[178, 212, 243, 377]"
4,part32,jj-39Vkt8jo_4,8.0,10.0,"[178, 243, 353]"


In [4]:
# Derive the parent recording id by stripping the trailing "_<n>" segment
# suffix from wav_id (e.g. 'jj-39Vkt8jo_0' -> 'jj-39Vkt8jo').
# Splitting once from the right keeps underscores that belong to the id
# itself (e.g. '6MFdiV_8wIM'), and the vectorised .str accessor replaces a
# per-row Python lambda. An id with no underscore at all is kept unchanged
# instead of being collapsed to an empty string, which would have merged
# unrelated recordings into a single group.
metadata['wav_id_'] = metadata['wav_id'].str.rsplit('_', n=1).str[0]

# Distinct recording ids, in order of first appearance in the table.
wav_ids = metadata['wav_id_'].unique()
print(len(wav_ids), wav_ids[:4])

95930 ['jj-39Vkt8jo' 'jTLxRcopOCM' 'jTbNcMezsZw' 'ija4BoG60-c']


In [5]:
# --- Disabled experiment (kept for reference; nothing in this cell runs) ---
# For every recording id it collects the class ids of all of its segments,
# de-duplicates them with np.unique, and stores them as a '-'-joined string
# in a two-column table (wav_id, wav_feature).
# NOTE(review): the 'classes' strings are parsed with eval(); if this cell is
# ever re-enabled, prefer ast.literal_eval so CSV text is not executed as code.
#
# wav_feature = {"wav_id": [], "wav_feature": []}

# for wav_id in wav_ids:
#     temp_data = metadata[metadata['wav_id_'] == wav_id].copy()
#     feature = []
#     for classes in temp_data['classes'].tolist():
#         feature += eval(classes)
#     feature = np.unique(np.array(feature)).tolist()
#     feature = '-'.join([str(x) for x in feature])
#     # break
#     wav_feature['wav_id'].append(wav_id)
#     wav_feature['wav_feature'].append(feature)

# wav_feature = pd.DataFrame(wav_feature)
# print(wav_feature.shape)
# wav_feature.head()

In [6]:
# Shuffle the recording ids in place with a fixed seed so the resulting
# order is reproducible on a fresh run.
# NOTE: the shuffle mutates wav_ids, so running this cell a second time
# reshuffles the already-shuffled array and yields a different order.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)
random.shuffle(wav_ids)

# Number of ids and the first few after shuffling.
print(len(wav_ids), wav_ids[:4])

95930 ['lCrFasUkKMY' '6MFdiV_8wIM' 'F3jA74HhBnE' 'izYM7k5w6t4']


In [7]:
# Number of cross-validation folds.
kfold = 10

# Round-robin fold assignment over the current order of wav_ids:
# the id at position i goes to fold i % kfold.
fold_of_position = [position % kfold for position in range(len(wav_ids))]
meta_kfold = pd.DataFrame({"wav_id_": wav_ids, "kfold": fold_of_position})
# meta_kfold.to_csv(f"{os.path.dirname(META_FILE)}/cross_valid_.csv", index=False)

print(meta_kfold.shape)
meta_kfold.head()

(95930, 2)


Unnamed: 0,wav_id_,kfold
0,lCrFasUkKMY,0
1,6MFdiV_8wIM,1
2,F3jA74HhBnE,2
3,izYM7k5w6t4,3
4,zagWLobkTm0,4


In [8]:
# Attach each recording's fold to every one of its segments.
# validate='many_to_one' makes pandas raise if a recording id occurs more
# than once in meta_kfold, which would otherwise silently duplicate segment
# rows. The helper key 'wav_id_' is dropped afterwards (non-inplace, so the
# whole step is a single expression producing the new frame).
n_segments = len(metadata)
metadata = (
    pd.merge(metadata, meta_kfold, on='wav_id_', validate='many_to_one')
    .drop(columns='wav_id_')
)

# The default inner join discards rows whose key has no match; fail loudly
# rather than lose segments unnoticed.
assert len(metadata) == n_segments, "merge changed the number of segment rows"

print(metadata.shape)
metadata.head()

(470086, 6)


Unnamed: 0,segments,wav_id,start_time,end_time,classes,kfold
0,part32,jj-39Vkt8jo_0,0.0,2.0,"[78, 103, 243, 316]",2
1,part32,jj-39Vkt8jo_1,2.0,4.0,"[78, 103, 178, 212, 243, 337, 377]",2
2,part32,jj-39Vkt8jo_2,4.0,6.0,"[178, 212, 243, 337, 377]",2
3,part32,jj-39Vkt8jo_3,6.0,8.0,"[178, 212, 243, 377]",2
4,part32,jj-39Vkt8jo_4,8.0,10.0,"[178, 243, 353]",2


In [9]:
# Write the fold-annotated metadata into the same directory as META_FILE,
# without the DataFrame index.
cross_valid_csv = f"{os.path.dirname(META_FILE)}/cross_valid.csv"
metadata.to_csv(cross_valid_csv, index=False)

In [15]:
# Open one packed fold file read-only and inspect its contents: the names
# of the top-level entries, the number of rows in "datasets", and the
# shape, first 16 entries and element type of "labels".
output_file = '/media/ubuntu/HD/Data/Audioset-Seg/data_cut_logmel_2.0s_hdf5/fold0_data_pack.h5'
with h5py.File(output_file, "r") as h5_file:
    print("H5文件中的数据集名称:", list(h5_file.keys()))
    print(h5_file["datasets"].shape[0])
    labels = h5_file["labels"]  # handle is only valid while the file is open
    print(labels.shape)
    print(labels[:16])
    print(type(labels[0]))

H5文件中的数据集名称: ['datasets', 'labels']
46999
(46999,)
[array([178, 413], dtype=int32) array([18], dtype=int32)
 array([ 18, 124, 377], dtype=int32) array([239, 254], dtype=int32)
 array([275, 415], dtype=int32) array([ 32, 239], dtype=int32)
 array([239, 254], dtype=int32)
 array([ 93, 114, 239, 243, 254], dtype=int32)
 array([ 78, 103, 254, 422], dtype=int32)
 array([ 46, 157, 200, 243], dtype=int32)
 array([ 36, 239, 443], dtype=int32) array([ 18, 254, 387], dtype=int32)
 array([243], dtype=int32) array([ 46, 242], dtype=int32)
 array([ 18,  46, 178, 230], dtype=int32)
 array([103, 114, 239, 380], dtype=int32)]
<class 'numpy.ndarray'>


In [13]:
# Toy check of the shuffle bookkeeping: shuffle 100 fake files, record for
# every row the position it received in the shuffled order ('h5_index'),
# and confirm that sorting by that position reproduces the shuffled list.
n_items = 100
temp_data = pd.DataFrame({
    "id": list(range(n_items)),
    'file': [f'f_{i}' for i in range(n_items)],
    'h5_index': [-1] * n_items,  # -1 marks "position not assigned yet"
})

# Seeded generator so the printed example is the same on every run.
demo_rng = np.random.default_rng(42)
shuffle_index = demo_rng.permutation(len(temp_data))

# Materialise the column once instead of rebuilding the list for every
# element of the comprehension.
file_names = temp_data['file'].tolist()
tiff_files = [file_names[i] for i in shuffle_index]

# The row at shuffle_index[k] is the k-th file in shuffled order, so it
# receives h5_index k.
temp_data.loc[temp_data.index[shuffle_index], 'h5_index'] = list(range(len(temp_data)))

print(tiff_files[:10])
temp_data = temp_data.sort_values('h5_index')

# Invariant being demonstrated: ordering rows by h5_index gives exactly the
# shuffled file list.
assert temp_data['file'].tolist() == tiff_files
temp_data.head(10)

['f_51', 'f_14', 'f_21', 'f_36', 'f_4', 'f_31', 'f_28', 'f_70', 'f_57', 'f_38']


Unnamed: 0,id,file,h5_index
51,51,f_51,0
14,14,f_14,1
21,21,f_21,2
36,36,f_36,3
4,4,f_4,4
31,31,f_31,5
28,28,f_28,6
70,70,f_70,7
57,57,f_57,8
38,38,f_38,9
