# ※ 각 포매팅 코드는 파일 이름·확장자를 바꾸거나 파일을 다른 폴더로 옮기므로, 실행하실 때 주의하셔야 합니다.

### 한국어 말뭉치 2023 (일상대화) RTTM Formatting

In [None]:
import os
import json

def json_to_rttm(json_file, json_dir, output_dir):
    """Convert one corpus JSON transcript into an RTTM file.

    Reads ``json_dir/json_file``, takes the ``utterance`` list of the first
    entry under ``document`` and writes one ``SPEAKER`` line per utterance to
    ``output_dir/<json_file without extension>.rttm``. A transcript without
    any utterance produces an empty RTTM file.

    Args:
        json_file: File name of the JSON transcript (not a full path).
        json_dir: Directory that contains ``json_file``.
        output_dir: Existing directory that receives the ``.rttm`` file.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    json_path = os.path.join(json_dir, json_file)

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    data_name = os.path.splitext(json_file)[0]

    # 'document' can be missing *or* present but empty/null; indexing [0] on
    # an empty list would raise IndexError, so fall back to one empty document.
    documents = data.get('document') or [{}]
    utterances = documents[0].get('utterance') or []

    rttm_lines = []
    for speech in utterances:
        # NOTE(review): the values are written unchanged into the RTTM
        # onset/duration fields, which are expressed in seconds - confirm that
        # the corpus stores 'start'/'end' in seconds.
        start = float(speech.get('start', 0))
        end = float(speech.get('end', 0))
        duration = end - start
        speaker_id = speech.get('speaker_id', 'unknown')

        # Build the RTTM line (channel is fixed to 1).
        rttm_lines.append(
            f"SPEAKER {data_name} 1 {start:.3f} {duration:.3f} <NA> <NA> {speaker_id} <NA>"
        )

    # Save the RTTM file: one newline-terminated line per utterance, and
    # nothing at all (rather than a lone blank line) when there are none.
    rttm_file = os.path.join(output_dir, f"{data_name}.rttm")
    with open(rttm_file, 'w', encoding='utf-8') as f:
        f.writelines(line + "\n" for line in rttm_lines)

    print(f"✅ RTTM 파일 생성 완료: {rttm_file}")

# Input (JSON transcripts) and output (RTTM) directories
json_dir = "kor/json/"
output_dir = "kor/rttm/"

# Make sure the output directory is there before anything is written
os.makedirs(output_dir, exist_ok=True)

# Keep only the corpus transcripts: names that start with 'SDRW' and end in '.json'
json_files = list(
    filter(
        lambda name: name.startswith('SDRW') and name.endswith('.json'),
        os.listdir(json_dir),
    )
)

# Convert every selected transcript
for json_file in json_files:
    json_to_rttm(json_file, json_dir, output_dir)

### Alimeeting RTTM Formatting

In [None]:
import os
from praatio import textgrid

def tg_to_rttm(tg_file, text_dir, output_dir):
    """Convert one Praat TextGrid into an RTTM file.

    Each tier is written as one speaker (the tier name becomes the speaker
    id) and each tier entry becomes one ``SPEAKER`` line. Lines are ordered
    by start time and written to ``output_dir/<name>.rttm``, where ``<name>``
    is ``tg_file`` without its trailing ``.TextGrid``. The entry labels are
    not used.

    Args:
        tg_file: File name of the TextGrid (not a full path).
        text_dir: Directory that contains ``tg_file``.
        output_dir: Existing directory that receives the ``.rttm`` file.
    """
    tg_path = os.path.join(text_dir, tg_file)

    # The second argument is praatio's ``includeEmptyIntervals`` flag.
    tg = textgrid.openTextgrid(tg_path, False)

    # Collect (start, duration, speaker) for every entry of every tier, going
    # through the public ``getTier`` accessor rather than the private
    # ``_tierDict`` attribute.
    segments = []
    for name in tg.tierNames:
        for entry in tg.getTier(name).entries:
            start = float(entry.start)
            segments.append((start, float(entry.end) - start, name))

    # Stable sort: entries with the same start keep their tier order.
    segments.sort(key=lambda segment: segment[0])

    # Strip only a trailing ".TextGrid", never an occurrence inside the name.
    suffix = ".TextGrid"
    data_name = tg_file[:-len(suffix)] if tg_file.endswith(suffix) else tg_file

    rttm_lines = [
        f"SPEAKER {data_name} 1 {start:.2f} {duration:.2f} <NA> <NA> {speaker_id} <NA>"
        for start, duration, speaker_id in segments
    ]

    # Save the RTTM file: one newline-terminated line per entry, and an empty
    # file (rather than a lone blank line) when the TextGrid has no entries.
    rttm_file = os.path.join(output_dir, f"{data_name}.rttm")
    with open(rttm_file, 'w', encoding='utf-8') as f:
        f.writelines(line + "\n" for line in rttm_lines)

    print(f"✅ RTTM 파일 생성 완료: {rttm_file}")
    
# Input (TextGrid) and output (RTTM) directories for Alimeeting
text_dir = "Train_Ali_far/textgrid_dir/"
output_dir = "Train_Ali_far/rttm/"

# Make sure the output directory is there before anything is written
os.makedirs(output_dir, exist_ok=True)

# Every '.TextGrid' file found in the input directory
tg_files = list(filter(lambda name: name.endswith('.TextGrid'), os.listdir(text_dir)))

# Convert them one by one
for tg_file in tg_files:
    tg_to_rttm(tg_file, text_dir, output_dir)


### Alimeeting 

#### Alimeeting 오디오 파일명과 RTTM 파일명이 서로 달라서, 오디오 파일명을 맞추기 위해 작성한 코드입니다.

In [None]:
import os

# The Alimeeting audio file names differ from the RTTM names, so the audio
# files are renamed to match: only the first two underscore-separated fields
# of the name are kept ("<a>_<b>_<rest>.wav" -> "<a>_<b>.wav").
# This renames files in place, so run it with care.
audio_dir = 'Train_Ali_far/audio_dir'
for file in os.listdir(audio_dir):
    # Only touch WAV files; anything else would otherwise be given a bogus
    # ".wav" extension.
    if not file.lower().endswith('.wav'):
        continue

    file_name = file.split('.')[0]
    fields = file_name.split('_')
    if len(fields) < 2:
        # No second field to build the new name from; leave the file alone.
        continue

    new_file_name = fields[0] + "_" + fields[1] + ".wav"
    if new_file_name == file:
        # Already in the target form (e.g. the cell was run before).
        continue

    target_path = os.path.join(audio_dir, new_file_name)
    if os.path.exists(target_path):
        # Two sources map to the same name; renaming would silently replace
        # the existing file on POSIX systems.
        print(f"skip {file}: {new_file_name} already exists")
        continue

    os.rename(os.path.join(audio_dir, file), target_path)

#### Alimeeting을 train, dev, test 셋으로 나누는 코드입니다.

In [None]:
# Split the Alimeeting RTTM files into train / test / dev sets
# (80% / 10% / remainder), write one ID list per set and move each RTTM file
# into the sub-directory of its set.

rttm_dir  = "Train_Ali_far/rttm"
train_dir = "Train_Ali_far/rttm/train/"
test_dir = "Train_Ali_far/rttm/test/"
dev_dir = "Train_Ali_far/rttm/dev/"

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(dev_dir, exist_ok=True)

# os.listdir returns names in arbitrary order; sort them so that the split is
# the same on every machine and every run.
tg_files = sorted(f for f in os.listdir(rttm_dir) if f.endswith('.rttm'))

# train, test, dev split
train_count = int(len(tg_files) * 0.8)
test_count = int(len(tg_files) * 0.1)
dev_count = len(tg_files) - train_count - test_count

train_files = tg_files[:train_count]
test_files = tg_files[train_count:train_count + test_count]
dev_files = tg_files[train_count + test_count:]

if not tg_files:
    # Nothing left directly under rttm_dir (e.g. the cell already ran and the
    # files were moved): do not overwrite the existing lists with empty ones.
    print(f"No .rttm files found in {rttm_dir}; split lists left untouched.")
else:
    splits = (
        ("Train_Ali_far/train_list.txt", train_files, train_dir),
        ("Train_Ali_far/test_list.txt", test_files, test_dir),
        ("Train_Ali_far/dev_list.txt", dev_files, dev_dir),
    )
    for list_path, split_files, split_dir in splits:
        # List file: one recording ID (file name up to the first '.') per line.
        with open(list_path, "w", encoding="utf-8") as f:
            f.writelines(file.split('.')[0] + "\n" for file in split_files)

        # Move the RTTM files of this set into its sub-directory.
        for file in split_files:
            os.rename(os.path.join(rttm_dir, file), os.path.join(split_dir, file))
    

#### uem 파일을 만드는 코드입니다.
- 코드가 번거롭게 작성되었는데, uem 파일이 필요하다는 것을 나중에야 알게 되어 그렇습니다. 죄송합니다.

In [None]:
from pydub import AudioSegment
import os

# Create the UEM output directories, one per split. makedirs with
# exist_ok=True also creates 'Train_Ali_far/uem' itself and, unlike os.mkdir,
# does not raise FileExistsError when this cell is run again.
os.makedirs('Train_Ali_far/uem/train', exist_ok=True)
os.makedirs('Train_Ali_far/uem/test', exist_ok=True)
os.makedirs('Train_Ali_far/uem/dev', exist_ok=True)

def get_duration(file):
    """Return the duration, in seconds, of the WAV file at path ``file``."""
    return AudioSegment.from_wav(file).duration_seconds

def _read_id_list(list_path):
    """Return the non-empty, stripped lines of the text file ``list_path``."""
    with open(list_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


# Recording IDs of each split, as written by the train/test/dev split step.
train_list = _read_id_list('Train_Ali_far/train_list.txt')
test_list = _read_id_list('Train_Ali_far/test_list.txt')
dev_list = _read_id_list('Train_Ali_far/dev_list.txt')

# Recording ID -> split name, for constant-time lookup. Later insertions win,
# so 'train' is inserted last to keep the precedence train > test > dev for an
# ID that appears in more than one list.
split_of = {}
for split_name, id_list in (('dev', dev_list), ('test', test_list), ('train', train_list)):
    for recording_id in id_list:
        split_of[recording_id] = split_name

# Write one UEM file per listed recording: "<id> 1 0.00 <duration>".
for file in os.listdir('Train_Ali_far/audio_dir'):
    file_name = file.split('.')[0]

    split_name = split_of.get(file_name)
    if split_name is None:
        # Not part of any split: skip it before the audio is decoded.
        continue

    duration = get_duration('Train_Ali_far/audio_dir/' + file)

    # generate uem file
    with open('Train_Ali_far/uem/' + split_name + '/' + file_name + '.uem', 'w', encoding='utf-8') as f:
        f.write(file_name + ' 1 ' + '0.00 ' + str(duration))