In [5]:
import json

def merge_coco_json(train_json_path, valid_json_path, output_json_path):
    with open(train_json_path, 'r') as train_file:
        train_data = json.load(train_file)
    
    with open(valid_json_path, 'r') as valid_file:
        valid_data = json.load(valid_file)
    
    # Merge images
    merged_data = train_data
    merged_data['images'].extend(valid_data['images'])
    
    # Merge annotations
    annotation_id_offset = max([ann['id'] for ann in train_data['annotations']]) + 1
    for ann in valid_data['annotations']:
        ann['id'] += annotation_id_offset
    merged_data['annotations'].extend(valid_data['annotations'])
    
    # Merge categories
    merged_data['categories'] = train_data['categories']
    
    with open(output_json_path, 'w') as output_file:
        json.dump(merged_data, output_file, indent=4)

if __name__ == "__main__":
    train_json_path = "/pls use your own path/datasets/images/train/_annotations.coco.json"
    valid_json_path = "/pls use your own path/datasets/images/valid/_annotations.coco.json"
    output_json_path = "/pls use your own path/0805/highQ_annotations.coco.json"
    
    merge_coco_json(train_json_path, valid_json_path, output_json_path)

In [26]:
!pip install audio-metadata

Collecting audio-metadata
  Downloading audio_metadata-0.11.1-py3-none-any.whl (41 kB)
     |████████████████████████████████| 41 kB 73 kB/s             
[?25hCollecting attrs<19.4,>=18.2
  Downloading attrs-19.3.0-py2.py3-none-any.whl (39 kB)
Collecting tbm-utils<3.0,>=2.3
  Downloading tbm_utils-2.6.0-py3-none-any.whl (12 kB)
Collecting more-itertools<9.0,>=4.0
  Downloading more_itertools-8.14.0-py3-none-any.whl (52 kB)
     |████████████████████████████████| 52 kB 113 kB/s            
[?25hCollecting bitstruct<9.0,>=6.0
  Downloading bitstruct-8.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (80 kB)
     |████████████████████████████████| 80 kB 701 kB/s            
[?25hCollecting pendulum!=2.0.5,!=2.1.0,<=3.0,>=2.0
  Downloading pendulum-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (385 kB)
     |████████████████████████████████| 385 kB 1.6 MB/s            
[?25hCollecting bidict<1.0.0
  Downloading bidict-0.23.1-py3-none-any.whl (32 kB)
Collec

In [28]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl (260 kB)
     |████████████████████████████████| 260 kB 254 kB/s            
[?25hCollecting msgpack>=1.0
  Downloading msgpack-1.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (385 kB)
     |████████████████████████████████| 385 kB 2.7 MB/s            
Collecting soundfile>=0.12.1
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
     |████████████████████████████████| 1.2 MB 6.3 MB/s            
Collecting pooch>=1.1
  Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
     |████████████████████████████████| 64 kB 232 kB/s             
[?25hCollecting soxr>=0.3.2
  Downloading soxr-0.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     |████████████████████████████████| 1.3 MB 17.8 MB/s            
Collecting typing-extensions>=4.1.1
  Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting lazy-loader>=0.1
  Downloading lazy_loader-

In [2]:
import os
import json

directory = '/pls use your own path/datasets/tests'

filenames = [os.path.splitext(file)[0] for file in os.listdir(directory) if file.endswith('.png')]

json_filename = '/pls use your own path/nonuse/200filenames.json'
with open(json_filename, 'w') as json_file:
    json.dump(filenames, json_file, indent=4)

print(f"文件名已保存到 {json_filename}")

文件名已保存到 /home/jovyan/work/MusicYOLO/nonuse/200filenames.json


In [32]:
import os
import json
import random
from collections import defaultdict

audio_dir = '/pls use your own path/cough/cough_detection_dataset'
json_file_path = '/pls use your own path/nonuse/200filenames.json'

with open(json_file_path, 'r') as file:
    specified_test_files = json.load(file)

# Ensure that the specified 200 audio file names do not have extensions
specified_test_files = [file.split('.')[0] for file in specified_test_files]

all_files = [file for file in os.listdir(audio_dir) if file.endswith('.wav')]

# Create a dictionary mapping patient IDs to lists of files
patient_files = defaultdict(list)
for file in all_files:
    patient_id = file.split('_')[0]
    patient_files[patient_id].append(file)

# Distribute the specified 200 audio files into the test set
test_files = []
for test_file in specified_test_files:
    for patient_id, files in list(patient_files.items()):
        matched_files = [file for file in files if file.startswith(test_file)]
        if matched_files:
            test_files.extend(matched_files)
            patient_files.pop(patient_id)
            break

# Ensure that the specified 200 audio files have the correct quantity
assert len(test_files) == len(specified_test_files), "指定的200个音频文件数量不正确"

# Randomly select the remaining 300 audio files for the test set
remaining_test_count = 500 - len(test_files)
remaining_patient_ids = list(patient_files.keys())
random.shuffle(remaining_patient_ids)

for patient_id in remaining_patient_ids:
    if remaining_test_count <= 0:
        break
    patient_files_for_id = patient_files[patient_id]
    if len(patient_files_for_id) <= remaining_test_count:
        test_files.extend(patient_files.pop(patient_id))
        remaining_test_count -= len(patient_files_for_id)

# Retrieve all the remaining audio files
remaining_files = []
remaining_patient_ids = list(patient_files.keys())
random.shuffle(remaining_patient_ids)

for patient_id in remaining_patient_ids:
    remaining_files.extend(patient_files[patient_id])

# Ensure that the remaining files are sufficient for allocation to the training and validation sets
assert len(remaining_files) >= 3292 + 500, "剩余文件数量不足以分配到训练集和验证集"

# Divide the remaining audio files by patient into the training and validation sets
train_files = []
valid_files = []

for patient_id in remaining_patient_ids:
    if len(train_files) < 3292:
        train_files.extend(patient_files[patient_id])
    elif len(valid_files) < 500:
        valid_files.extend(patient_files[patient_id])

# Check if the division is correct
assert len(train_files) == 3292, "训练集数量不正确"
assert len(test_files) == 500, "测试集数量不正确"
assert len(valid_files) == 500, "验证集数量不正确"

split_result = {
    "train": train_files,
    "test": test_files,
    "valid": valid_files
}

with open('dddataset_split.json', 'w') as file:
    json.dump(split_result, file, indent=4)

print("数据集划分已完成并保存到 dataset_split.json")

数据集划分已完成并保存到 dataset_split.json


In [33]:
import json
from collections import defaultdict

with open('/pls use your own path/nonuse/dddataset_split.json', 'r') as file:
    split_result = json.load(file)

# Retrieve the patient IDs
def get_patient_id(filename):
    return filename.split('_')[0]

patient_groups = defaultdict(set)

for group, files in split_result.items():
    for file in files:
        patient_id = get_patient_id(file)
        patient_groups[patient_id].add(group)

# Check if any patient IDs appear in multiple groups
errors = []
for patient_id, groups in patient_groups.items():
    if len(groups) > 1:
        errors.append((patient_id, list(groups)))

if errors:
    print("以下患者的音频文件分布在多个组别中:")
    for patient_id, groups in errors:
        print(f"患者ID: {patient_id}, 组别: {groups}")
else:
    print("所有患者的音频文件都正确分配到了相同的组别中。")

所有患者的音频文件都正确分配到了相同的组别中。


In [43]:
import os
import shutil
import json

dataset_dir = '/pls use your own path/cough/cough_detection_dataset'
split_file = '/pls use your own path/nonuse/dddataset_split.json'
output_dir = '/pls use your own path/data4retrain'

train_dir = os.path.join(output_dir, 'train')
test_dir = os.path.join(output_dir, 'test')
valid_dir = os.path.join(output_dir, 'valid')

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(valid_dir, exist_ok=True)

with open(split_file, 'r') as file:
    split_result = json.load(file)

def copy_files(file_list, target_dir):
    count = 0
    for file_name in file_list:
        source_file = os.path.join(dataset_dir, file_name)
        target_file = os.path.join(target_dir, file_name)
        if os.path.exists(source_file):
            shutil.copy(source_file, target_file)
            count += 1
        else:
            print(f"文件 {source_file} 不存在，跳过复制。")
    return count

# copy files and print the number of files
train_count = copy_files(split_result['train'], train_dir)
print(f"训练集文件数量: {train_count}")

test_count = copy_files(split_result['test'], test_dir)
print(f"测试集文件数量: {test_count}")

valid_count = copy_files(split_result['valid'], valid_dir)
print(f"验证集文件数量: {valid_count}")

print("数据集划分已完成并复制到相应的文件夹中。")

训练集文件数量: 3292
测试集文件数量: 500
验证集文件数量: 500
数据集划分已完成并复制到相应的文件夹中。


In [46]:
import os
import shutil
import json

dataset_dir = '/pls use your own path/cough/cough_detection_dataset'
split_file = '/pls use your own path/nonuse/dddataset_split.json'
output_dir = '/pls use your own path/data4retrain'

train_dir = os.path.join(output_dir, 'train')
test_dir = os.path.join(output_dir, 'test')
valid_dir = os.path.join(output_dir, 'valid')

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(valid_dir, exist_ok=True)

with open(split_file, 'r') as file:
    split_result = json.load(file)

def copy_files(file_list, target_dir):
    copied_files = []
    for file_name in file_list:
        source_file = os.path.join(dataset_dir, file_name)
        target_file = os.path.join(target_dir, file_name)
        if os.path.exists(source_file):
            shutil.copy(source_file, target_file)
            copied_files.append(file_name)
        else:
            print(f"文件 {source_file} 不存在，跳过复制。")
    return copied_files

# copy files and print the number of files
train_files = copy_files(split_result['train'], train_dir)
print(f"训练集文件数量: {len(train_files)}")
print("训练集文件名: ", train_files)

test_files = copy_files(split_result['test'], test_dir)
print(f"测试集文件数量: {len(test_files)}")
print("测试集文件名: ", test_files)

valid_files = copy_files(split_result['valid'], valid_dir)
print(f"验证集文件数量: {len(valid_files)}")
print("验证集文件名: ", valid_files)

print("数据集划分已完成并复制到相应的文件夹中。")

# check the number of the files in the target directory
actual_train_files = os.listdir(train_dir)
actual_test_files = os.listdir(test_dir)
actual_valid_files = os.listdir(valid_dir)

print(f"实际训练集文件数量: {len(actual_train_files)}")
print(f"实际测试集文件数量: {len(actual_test_files)}")
print(f"实际验证集文件数量: {len(actual_valid_files)}")

训练集文件数量: 3292
训练集文件名:  ['2023-006-1385_1698893692.wav', '20230214-1676356064.wav', '2023-006-2311_1702522284.wav', '2023-006-2439_1703121709.wav', '2023-006-2851_1703581413.wav', '2023-006-0686_1695019079.wav', '20221115-1668502487.wav', '20221208-1670468862.wav', '2023-008-0541_1698038421.wav', '2023-006-2848_1703581174.wav', '2023-006-0490_1693208558.wav', '20221208-1670463838.wav', '2023-006-1717_1700725955.wav', '2022-016-2184_1677569304.wav', '2022-016-2184_1677569329.wav', '2023-006-1443_1699410486.wav', '2023-006-1232_1698801521.wav', '20221124-1669276589.wav', '20230210-1676009201.wav', '2023-008-0453_1697071998.wav', '20230206-1675666472.wav', '2023-006-2360_1702966194.wav', '2023-008-0819_1702876660.wav', '2023-008-0845_1703051223.wav', '2023-006-1139_1698653597.wav', '2023-004-1511_1685496634.wav', '2023-004-1511_1685496644.wav', '2023-004-1511_1685496651.wav', '20221202-1669951339.wav', '2023-006-2730_1703577841.wav', '2023-006-0672_1694676187.wav', '2023-008-0707_170106363