In [1]:
from lhotse import load_manifest, CutSet
import os
import json
from tqdm import tqdm

In [35]:
# Dataset split to process; this value is interpolated into the input manifest
# path and into the RTTM / NeMo manifest output paths further down.
split = "test"

In [36]:
# Load the Lhotse manifest for the selected split. The location is an absolute,
# machine-specific path, so this cell only runs where that path exists.
manifest_path = f'/export/fs06/xhe69/TS-ASR-Whisper/data/manifests/ami-sdm_{split}_sc_cutset.jsonl.gz'
cset = load_manifest(manifest_path)

In [37]:
# Print summary statistics (cut count and duration distribution) for the loaded cuts.
cset.describe()

Cut statistics:
╒═══════════════════════════╤══════════╕
│ Cuts count:               │ 16       │
├───────────────────────────┼──────────┤
│ Total duration (hh:mm:ss) │ 09:03:45 │
├───────────────────────────┼──────────┤
│ mean                      │ 2039.0   │
├───────────────────────────┼──────────┤
│ std                       │ 536.0    │
├───────────────────────────┼──────────┤
│ min                       │ 839.0    │
├───────────────────────────┼──────────┤
│ 25%                       │ 1812.3   │
├───────────────────────────┼──────────┤
│ 50%                       │ 2176.3   │
├───────────────────────────┼──────────┤
│ 75%                       │ 2337.1   │
├───────────────────────────┼──────────┤
│ 99%                       │ 2919.1   │
├───────────────────────────┼──────────┤
│ 99.5%                     │ 2945.7   │
├───────────────────────────┼──────────┤
│ 99.9%                     │ 2966.9   │
├───────────────────────────┼──────────┤
│ max                       │ 2972.3   │


In [38]:
# Sliding-window chunking: every cut is split into fixed-length windows of
# `chunk_duration`, with consecutive windows sharing `overlap_duration`.
chunk_duration = 90
overlap_duration = 10
step = chunk_duration - overlap_duration


def _window_starts(total, window, hop):
    """Yield the start offset of each window of length `window` that fits entirely in `total`.

    Offsets begin at 0.0 and advance by `hop`. A trailing remainder shorter
    than `window` gets no offset, and a `total` shorter than `window` yields
    nothing at all.
    """
    offset = 0.0
    while offset + window <= total:
        yield offset
        offset += hop


# preserve_id=False: the chunks do not keep the original cut's ID.
new_cuts = [
    cut.truncate(offset=start, duration=chunk_duration, preserve_id=False)
    for cut in cset
    for start in _window_starts(cut.duration, chunk_duration, step)
]

print(len(new_cuts))
# Wrap the list of chunks in a new CutSet.
chunked_cset = CutSet.from_cuts(new_cuts)


399


In [39]:
# Print summary statistics for the chunked cuts (sanity check on chunk count and length).
chunked_cset.describe()

Cut statistics:
╒═══════════════════════════╤══════════╕
│ Cuts count:               │ 399      │
├───────────────────────────┼──────────┤
│ Total duration (hh:mm:ss) │ 09:58:30 │
├───────────────────────────┼──────────┤
│ mean                      │ 90.0     │
├───────────────────────────┼──────────┤
│ std                       │ 0.0      │
├───────────────────────────┼──────────┤
│ min                       │ 90.0     │
├───────────────────────────┼──────────┤
│ 25%                       │ 90.0     │
├───────────────────────────┼──────────┤
│ 50%                       │ 90.0     │
├───────────────────────────┼──────────┤
│ 75%                       │ 90.0     │
├───────────────────────────┼──────────┤
│ 99%                       │ 90.0     │
├───────────────────────────┼──────────┤
│ 99.5%                     │ 90.0     │
├───────────────────────────┼──────────┤
│ 99.9%                     │ 90.0     │
├───────────────────────────┼──────────┤
│ max                       │ 90.0     │


In [40]:
# Spot-check one chunk: print start, duration and speaker of each supervision,
# then the number of distinct speakers seen in that chunk.
example = chunked_cset[1]
speakers = set()
for segment in example.supervisions:
    print(segment.start, segment.duration, segment.speaker)
    speakers.add(segment.speaker)
print(len(speakers))

-2.92 5.070000000000007 MTD009PM
2.15 1.8900000000000006 MTD009PM
4.04 2.1699999999999875 MTD009PM
6.21 5.210000000000008 MTD009PM
11.42 2.4099999999999966 MTD009PM
13.83 2.1700000000000017 MTD009PM
16.0 2.0 MTD009PM
19.11 3.0999999999999943 MTD009PM
22.21 5.260000000000005 MTD009PM
29.64 2.9200000000000017 MTD009PM
29.75 0.21999999999999886 MTD011UID
34.8 6.969999999999999 MTD009PM
41.77 2.9200000000000017 MTD009PM
44.69 2.8900000000000006 MTD009PM
47.58 10.11 MTD009PM
57.69 2.180000000000007 MTD009PM
61.37 14.740000000000009 MTD009PM
76.11 1.4399999999999977 MTD009PM
78.44 4.900000000000006 MTD009PM
85.11 8.909999999999997 MTD009PM
2


In [41]:
# Output directory for the per-chunk RTTM files. The chunk length in the name is
# taken from chunk_duration (set in the chunking cell) so the directory always
# matches the chunking actually used; with chunk_duration = 90 the resulting
# path is identical to the previously hard-coded "..._90schunks".
rttm_dir = f"data/nemo/rttms/ami_sdm_{split}_{chunk_duration}schunks"
os.makedirs(rttm_dir, exist_ok=True)

In [44]:
# Export every chunk as (a) one RTTM file listing its supervisions and (b) one
# JSON line in a NeMo-style manifest that points at the audio and at that RTTM.
# The chunk length in the file name follows chunk_duration; with
# chunk_duration = 90 the path is identical to the previously hard-coded one.
nemo_manifest_path = f"data/nemo/manifests/ami-sdm_{split}_sc_cutset_nemo_{chunk_duration}schunks.jsonl"
# open(..., "w") does not create missing parent directories.
os.makedirs(os.path.dirname(nemo_manifest_path), exist_ok=True)

# Chunks with more distinct speakers than this are skipped entirely.
max_speakers = 4

# Total exported cut duration per speaker count.
# Layout: slot i (i = 0..3) holds cuts with i + 1 speakers; the LAST slot holds
# cuts with zero speakers. Zero-speaker cuts previously reached that slot only
# through index -1 wrapping around; the slot is now chosen explicitly and the
# layout is kept so previously recorded numbers remain comparable.
speaker_durations = [0.0, 0.0, 0.0, 0.0, 0.0]
zero_speaker_slot = len(speaker_durations) - 1

with open(nemo_manifest_path, "w") as nemo_out:

    for cut in tqdm(chunked_cset):
        recording_id = cut.id
        rttm_lines = []
        all_texts = []
        speakers = set()

        for sup in cut.supervisions:
            # sup.start is offset by cut.start, the same value written as the
            # manifest "offset" below. Presumably this yields the time within
            # the source recording -- TODO confirm against the RTTM consumer.
            rttm_lines.append(
                f"SPEAKER {recording_id} 1 {sup.start+cut.start:.2f} {sup.duration:.2f} <NA> <NA> {sup.speaker} <NA> <NA>"
            )
            speakers.add(sup.speaker)
            if sup.text:
                all_texts.append(sup.text)
        num_speakers = len(speakers)

        # Skipped chunks produce neither an RTTM file nor a manifest line.
        if num_speakers > max_speakers:
            continue

        if num_speakers > 0:
            slot = num_speakers - 1
        else:
            slot = zero_speaker_slot
        speaker_durations[slot] += cut.duration

        # Write RTTM (an empty file when the chunk has no supervisions).
        rttm_path = os.path.join(rttm_dir, f"{recording_id}.rttm")
        with open(rttm_path, "w") as rttm_out:
            rttm_out.write("\n".join(rttm_lines))

        # Build text field; "-" is the placeholder when no supervision has text.
        combined_text = " ".join(all_texts) if all_texts else "-"

        # Build NeMo entry
        nemo_entry = {
            "audio_filepath": cut.recording.sources[0].source,
            "offset": cut.start,
            "duration": cut.duration,
            "label": "infer",
            "text": combined_text,
            "num_speakers": num_speakers,
            "rttm_filepath": os.path.abspath(rttm_path)
        }
        nemo_out.write(json.dumps(nemo_entry) + "\n")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 399/399 [00:00<00:00, 809.95it/s]


In [45]:
# Accumulated cut duration per speaker count. Entry i (i = 0..3) is the total for
# cuts with i + 1 distinct speakers; cuts with zero speakers are accumulated in
# the last entry. Cuts with more than 4 speakers are skipped and not counted.
speaker_durations

[990.0, 3060.0, 9180.0, 22410.0, 270.0]

In [None]:
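# Recorded speaker_durations by chunk length (same layout as the list above;
# presumably from a different split than the run shown here -- TODO confirm):
# 30s: [41250.0, 88260.0, 147480.0, 137880.0, 9810.0]
# 90s: [9000.0, 24750.0, 81090.0, 193680.0, 5220.0]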