In [None]:
import os
import json
from itertools import count
import heapq
import soundfile as sf
import librosa


In [2]:
from LibriSpeechEntity import LibriSpeechEntity



In [3]:
INPUTDIR = "/mnt/d/voicedata/test-clean/LibriSpeech/test-clean"
N_WORKERS = os.cpu_count() or 4

In [4]:
speakerchapter_dict = {}
OUT_DIR = '../datasets/LongSpeech'

In [5]:

# read from metadata
config = json.load(open(os.path.join(OUT_DIR, 'metadata.json')))

AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
OUT_FILE_NAME = config['source']

counter = count(start=config['amount'])

In [6]:
for speakers in os.listdir(INPUTDIR):
    for chapter in os.listdir(os.path.join(INPUTDIR, speakers)):
        speaker_chapter_key = speakers + '#' + chapter
        duration = 0
        segment_amount = 0
        for wavs in os.listdir(os.path.join(INPUTDIR, speakers, chapter)):

            if not wavs.endswith(".flac"):
                continue

            audio_path = os.path.join(INPUTDIR, speakers, chapter, wavs)

            info = sf.info(audio_path)
            dur  = info.frames / info.samplerate
            duration += dur
            segment_amount += 1
        speakerchapter_dict[speaker_chapter_key] = (duration, 0, segment_amount)



print(speakerchapter_dict)

{'1089#134686': (276.21006250000005, 0, 38), '1089#134691': (206.8500625, 0, 26), '1188#133604': (491.93493750000016, 0, 45), '121#121726': (79.09, 0, 15), '121#123852': (76.64500000000001, 0, 5), '121#123859': (93.15499999999999, 0, 5), '121#127105': (231.69499999999996, 0, 37), '1221#135766': (176.6, 0, 16), '1221#135767': (307.745, 0, 25), '1284#1180': (227.9, 0, 33), '1284#1181': (146.97999999999996, 0, 22), '1284#134647': (114.5550625, 0, 8), '1320#122612': (129.125, 0, 17), '1320#122617': (352.1499375000001, 0, 42), '1580#141083': (257.0300625, 0, 54), '1580#141084': (227.25000000000003, 0, 51), '1995#1826': (164.43506250000002, 0, 27), '1995#1836': (142.08499999999998, 0, 15), '1995#1837': (177.23506250000003, 0, 30), '2094#142345': (485.64499999999975, 0, 61), '2300#131720': (491.195, 0, 42), '237#126133': (166.965, 0, 26), '237#134493': (115.01500000000001, 0, 19), '237#134500': (199.50493749999998, 0, 43), '260#123286': (173.63, 0, 32), '260#123288': (204.04506250000006, 0, 2

In [7]:
# sort by total duration
speakerchapter_dict = dict(sorted(speakerchapter_dict.items(), key=lambda item: item[1], reverse=True))
speakerchapter_dict

{'5639#40744': (496.68993750000004, 0, 42),
 '672#122797': (496.1649999999999, 0, 75),
 '8230#279154': (494.80000000000007, 0, 44),
 '1188#133604': (491.93493750000016, 0, 45),
 '2300#131720': (491.195, 0, 42),
 '7729#102255': (489.94506250000006, 0, 47),
 '2094#142345': (485.64499999999975, 0, 61),
 '3575#170457': (483.6251874999999, 0, 57),
 '4507#16021': (483.295, 0, 60),
 '8455#210777': (481.8549999999999, 0, 71),
 '3729#6852': (481.68999999999994, 0, 47),
 '2830#3980': (390.30506249999996, 0, 77),
 '1320#122617': (352.1499375000001, 0, 42),
 '8224#274381': (331.1649999999999, 0, 18),
 '1221#135767': (307.745, 0, 25),
 '2961#960': (281.9850625, 0, 23),
 '61#70968': (280.99493750000005, 0, 63),
 '7176#92135': (279.87000000000006, 0, 46),
 '4970#29095': (278.30999999999995, 0, 39),
 '1089#134686': (276.21006250000005, 0, 38),
 '4077#13751': (273.26000000000005, 0, 22),
 '908#157963': (266.47999999999996, 0, 31),
 '5142#33396': (264.015, 0, 69),
 '7127#75947': (263.26493750000003, 0, 

In [8]:
heap = [(-dur, key) for key, (dur, _, _) in speakerchapter_dict.items()]
heapq.heapify(heap)
long_entities = []

In [9]:
def get_audio_file_name(key, used_seg):
    speaker = key.split('#')[0]
    chapter = key.split('#')[1]
    return os.path.join(speaker, chapter, f"{speaker}-{chapter}-{used_seg:04d}.flac")

def get_transcribe(key, used_seg):
    speaker = key.split('#')[0]
    chapter = key.split('#')[1]
    fileindex = f"{speaker}-{chapter}-{used_seg:04d}"
    transcribe_path = os.path.join(INPUTDIR, speaker, chapter, f"{speaker}-{chapter}.trans.txt")
    with open(transcribe_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith(fileindex):
                return line.strip().split(' ', 1)[1]
    return None




In [13]:
MAX_PARTS = 5

In [25]:
with OUT_FILE_NAME.open("a") as output_f:
    while heap:                                  # ❶ 堆里还有章节
        ent = LibriSpeechEntity(next(counter))   # 先建一条长音频

        # ❷ 不断取“剩余最长”的 chapter 给同一条 entity 填料
        while len(ent.components) < MAX_PARTS and heap:
            neg_dur, key = heapq.heappop(heap)
            remain_dur, used_seg, total_seg = speakerchapter_dict[key]
            speaker, chapter = key.split('#')

            while used_seg <= total_seg and len(ent.components) < MAX_PARTS:
                filename   = f"{speaker}-{chapter}-{used_seg:04d}.flac"
                audio_path = os.path.join(INPUTDIR, speaker, chapter, filename)
                audio, src = librosa.load(audio_path, sr=SAMPLE_RATE)
                text       = get_transcribe(key, used_seg)

                if not ent.appendaudio(audio, text, filename):
                    break

                seg_len_sec  = len(audio) / SAMPLE_RATE
                remain_dur  -= seg_len_sec
                used_seg    += 1

            # ----- 废物利用 -----
            speakerchapter_dict[key] = (remain_dur, used_seg, total_seg)
            if used_seg <= total_seg:                     # 章里还有料
                heapq.heappush(heap, (-remain_dur, key))


        if ent.finished:
            wav_path = ent.export_wav()
            output_f.write(ent.get_metadata() + "\n")
            print(f"[OK] {wav_path}  {ent.duration_sec:.1f}s  {len(ent.components)} seg")
        else:
            break

AttributeError: 'str' object has no attribute 'open'