In [13]:
# EXPLANATION: DEFINITION
# Reads list of AIHUB json files and returns the name of the file that adds up to 1800 sec (= 30 min)

import json
import os

def process_json_files(directory, time):
    total_speech_end = 0
    file_count = 0
    json_files = sorted([f for f in os.listdir(directory) if f.endswith('.json')])

    for filename in json_files:
        if filename.endswith(".json"):
            with open(os.path.join(directory, filename), 'r') as file:
                data = json.load(file)
                if '기타정보' in data and 'SpeechEnd' in data['기타정보']:
                    tt = data['기타정보']['SpeechEnd'] - data['기타정보']['SpeechStart']
                    total_speech_end += tt
                    # print(filename)
                    file_count += 1
                    if total_speech_end > time:
                        return file_count, total_speech_end, filename
    return -1, -1, -1




In [12]:
# EXPLANATION: EXECUTION - TRAINING DATA
# Print out where to stop for training (1800sec = 30min)

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL1/1.³²¼º/2400¹®Àå/0220_G1A3E7_BYK'
result, tt, filename = process_json_files(json_directory, 1800)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL1/1.³²¼º/2400¹®Àå/0347_G1A5E7_CKY'
result, tt, filename = process_json_files(json_directory, 1800)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL5/2.¿©¼º/2400¹®Àå/1538_G2A1E7_JMH'
result, tt, filename = process_json_files(json_directory, 1800)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL5/2.¿©¼º/2400¹®Àå/9042_G2A6E7_KSB'
result, tt, filename = process_json_files(json_directory, 1800)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

# trust the counts. There seems like some file has skipped numbers.

Count when sum hits above 1800: 823, 1800.1500000000005, 0220_G1A3E7_BYK_000832.json
Count when sum hits above 1800: 670, 1800.9800000000002, 0347_G1A5E7_CKY_000678.json
Count when sum hits above 1800: 744, 1802.2000000000016, 1538_G2A1E7_JMH_000750.json
Count when sum hits above 1800: 741, 1803.169999999999, 9042_G2A6E7_KSB_000741.json


In [11]:
# EXPLANATION: EXECUTION - VALIDATION DATA
# print out where to stop for validation (2250 = 1800 + 1800/4)

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL1/1.³²¼º/2400¹®Àå/0220_G1A3E7_BYK'
# /home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/원천데이터/VS1/1.³²¼º/2400¹®Àå/0220_G1A3E7_BYK
result, tt, filename = process_json_files(json_directory, 2250)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL1/1.³²¼º/2400¹®Àå/0347_G1A5E7_CKY'
# /home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/원천데이터/VS1/1.³²¼º/2400¹®Àå/0347_G1A5E7_CKY
result, tt, filename = process_json_files(json_directory, 2250)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL5/2.¿©¼º/2400¹®Àå/1538_G2A1E7_JMH'
# /home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/원천데이터/VS5/2.¿©¼º/2400¹®Àå/1538_G2A1E7_JMH
result, tt, filename = process_json_files(json_directory, 2250)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

json_directory = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL5/2.¿©¼º/2400¹®Àå/9042_G2A6E7_KSB'
# /home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/원천데이터/VS5/2.¿©¼º/2400¹®Àå/9042_G2A6E7_KSB
result, tt, filename = process_json_files(json_directory, 2250)
print(f"Count when sum hits above 1800: {result}, {tt}, {filename}")

Count when sum hits above 1800: 1002, 2251.949999999999, 0220_G1A3E7_BYK_001013.json
Count when sum hits above 1800: 837, 2251.6500000000015, 0347_G1A5E7_CKY_000845.json
Count when sum hits above 1800: 926, 2251.220000000001, 1538_G2A1E7_JMH_000932.json
Count when sum hits above 1800: 930, 2252.0699999999997, 9042_G2A6E7_KSB_000930.json


In [29]:
# EXPLANATION: DEFINITION
# reads in AIHUB data's json file and transcribe them to fit KSS style.

import json
import os

def process_json_file(json_file_path, fileno):
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    org_label_text = data['전사정보']['OrgLabelText']
    return f"{fileno}|{org_label_text}"

def write_to_file(input_folder, output_folder, last_name):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    json_files = sorted([f for f in os.listdir(input_folder) if f.endswith('.json')])

    tot = []
    for filename in json_files:
        if filename.endswith('.json'):
            json_file_path = os.path.join(input_folder, filename)
            output_text = process_json_file(json_file_path, filename[-11:-5])
            tot.append(output_text)
        if filename == last_name :
            break
        

    output_file_path = os.path.join(output_folder, f"{input_folder[-3:]}.txt")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for a in tot:
            output_file.write(f'{a}\n')
        print('write successful')



In [30]:
# EXPLANATION: EXECUTION - WRITE TRANSCRIPT
# reads json file and write them in transcrption style


input_folder = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL1/1.³²¼º/2400¹®Àå/0220_G1A3E7_BYK'
output_folder = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB'
last_name = '0220_G1A3E7_BYK_001013.json' # filename to stop recording
write_to_file(input_folder, output_folder, last_name)

input_folder = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL1/1.³²¼º/2400¹®Àå/0347_G1A5E7_CKY'
output_folder = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB'
last_name = '0347_G1A5E7_CKY_000845.json' # filename to stop recording
write_to_file(input_folder, output_folder, last_name)

input_folder = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL5/2.¿©¼º/2400¹®Àå/1538_G2A1E7_JMH'
output_folder = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB'
last_name = '1538_G2A1E7_JMH_000932.json' # filename to stop recording
write_to_file(input_folder, output_folder, last_name)

input_folder = '/home/soma1/문서/014.다화자 음성합성 데이터/01.데이터/2.Validation/라벨링데이터/VL5/2.¿©¼º/2400¹®Àå/9042_G2A6E7_KSB'
output_folder = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB'
last_name = '9042_G2A6E7_KSB_000930.json' # filename to stop recording
write_to_file(input_folder, output_folder, last_name)

write successful
write successful
write successful
write successful


In [33]:
## EXPLANATION: DEFINITION
# create .lab files

import os, tqdm, re
from tqdm import tqdm
from jamo import h2j
from glob import glob

def make_lab(text, base_dir):
    filters = '([.,!?])'

    with open(text, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            temp = line.split('|')
            file_dir, script = temp[0], temp[1]
            script = re.sub(re.compile(filters), '', script)
            # file_dir = file_dir.split('/')
            # fn = file_dir[0] + '/' + file_dir[1][:-3] + 'lab' # filename configuration? '1/1_0015.lab'
            file_list = sorted(glob(os.path.join(base_dir, '*.wav')))
            fn = ''
            for ff in file_list:
                # print(ff[-10:-4], file_dir)
                if ff[-10:-4] == file_dir:
                    fn = ff[:-4] + '.lab'
                    break
            # fn = file_dir + '.lab'
            file_dir = os.path.join(base_dir, fn) # base_dir/1/1_0015.lab
            with open(file_dir, 'w', encoding='utf-8') as f:
                f.write(script)
    ###
    file_list = sorted(glob(os.path.join(base_dir, '*.lab')))
    print(file_list)
    ###
    jamo_dict = {}
    for file_name in tqdm(file_list):
        sentence =  open(file_name, 'r', encoding='utf-8').readline()
        jamo = h2j(sentence).split(' ')
        for i, s in enumerate(jamo):
            if s not in jamo_dict:
                jamo_dict[s.rstrip()] = ' '.join(jamo[i].rstrip()) # FREAKING NEWLINE BABY!!
    
    return jamo_dict

In [34]:
# EXPLANATION: DEFINITION AND EXECUTION
# creation of .lab file for MFA
# sourcecode from https://chldkato.tistory.com/195, fixed by SWL

import os, tqdm, re
from tqdm import tqdm
from jamo import h2j
from glob import glob

a = {}
text = 'transcript.v.1.4.txt'
base_dir = 'kss'
filters = '([.,!?])'

text = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK.txt'
base_dir = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK'
a = make_lab(text, base_dir)

text = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY.txt'
base_dir = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY'
b = make_lab(text, base_dir)
a = a | b

text = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH.txt'
base_dir = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH'
b = make_lab(text, base_dir)
a = a | b

text = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB.txt'
base_dir = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB'
b = make_lab(text, base_dir)
a = a | b

###
base_dir = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/'
dict_name = 'new_korean_dict.txt'
with open(os.path.join(base_dir, dict_name), 'w', encoding='utf-8') as f:
    for key in a.keys():
        # content = '{}\t{}\n'.format(key, jamo_dict[key])
        content = f'{key}\t{a[key]}\n'
        f.write(content)

# move onto using mfa commands after this stage

['/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000001.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000002.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000003.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000004.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000005.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000006.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/BYK/0220_G1A3E7_BYK_000007.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/K

100%|██████████| 1002/1002 [00:00<00:00, 29359.69it/s]


['/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000001.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000002.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000003.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000004.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000005.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000006.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/CKY/0347_G1A5E7_CKY_000007.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/K

100%|██████████| 837/837 [00:00<00:00, 28785.59it/s]


['/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000001.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000002.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000003.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000004.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000005.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000006.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/JMH/1538_G2A1E7_JMH_000007.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/K

100%|██████████| 926/926 [00:00<00:00, 28175.83it/s]


['/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000001.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000002.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000003.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000004.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000005.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000006.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB/9042_G2A6E7_KSB_000007.lab', '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/K

100%|██████████| 930/930 [00:00<00:00, 24429.17it/s]


In [None]:
# EXPLANATION: DEFINITION
# https://github.com/hwRG/FastSpeech2-Pytorch-Korean-Multi-Speaker/blob/main/data_preprocessing.py
### Data Preprocessing
## 1. Json to Transcript
## 2. Aligner
## 3. Text Replace

from jamo import h2j 
import json
import os, re, tqdm
import unicodedata
from tqdm import tqdm
import hparams as hp

name = hp.dataset

first_dir = os.getcwd()

transcript = name + '_transcript.txt'
dict_name = name + '_korean_dict.txt'

data_dir = 'wavs'
json_label_dir = 'label'


def change_name(base_dir, format):
    print('Change', format, 'name')
    cnt = 0
    speaker_table = os.listdir(base_dir)
    new_speaker_table = []
    
    for speaker in speaker_table:
        if cnt == 0:
            os.chdir(base_dir)
            
        new_speaker_name = re.sub(r'[^0-9]', '', speaker)
        
        overlap = 1
        while new_speaker_name in new_speaker_table:
            print(new_speaker_name, 'is dangerous')
            new_speaker_name = str(overlap) + new_speaker_name[1:]
            overlap += 1
        
        new_speaker_table.append(re.sub(r'[^0-9]', '', new_speaker_name))
        print(new_speaker_name, 'ok')
        
        temp = 0
        for wav in os.listdir(speaker):
            if temp == 0:
                os.chdir(speaker)
            new_wav_name = re.sub(r'[^0-9]', '', wav)

            # new wav_name을 그대로 사용해야 함
            if new_wav_name[:len(new_speaker_name)] != wav:
                if new_wav_name[:len(new_speaker_name)] == new_speaker_name:
                    new_wav_name = new_wav_name + wav[-(len(format)+1):]
                else:
                    new_wav_name = new_speaker_name + new_wav_name + wav[-(len(format)+1):]
                os.rename(wav, new_wav_name)
            
            temp+=1; cnt +=1
            
        os.chdir('../')
        os.rename(speaker, new_speaker_name)
    print(cnt,'All Done', end='\n\n')
    os.chdir('../')


def json_to_transcripts():
    speakers = os.listdir(json_label_dir)
    speakers.sort()
    print(len(speakers), "speaker's are Sorted.")
    os.chdir(json_label_dir)

    utterance_text = []
    cnt = 1
    for speaker in speakers:
        for file in os.listdir(speaker):
            if cnt % 1000 == 0:
                print(cnt, 'Done')

            utterance_set = []
            with open(os.path.join(speaker, file)) as f:
                json_data = json.load(f)
                utterance_set.append(file[:-4] + 'wav')
                utterance_set.append(line_replace(json_data['발화정보']['stt']))
                
                # unicodedata.normalize로 글자의 위치까지 기록해 나열
                sep_text = unicodedata.normalize('NFD',line_replace(json_data['발화정보']['stt']))
                utterance_set.append(sep_text)
                
                utterance_set.append(round(float(json_data['발화정보']['recrdTime']),1))
                
                utterance_text.append(utterance_set)
            cnt+=1

    print(cnt-1, 'All Done')
    os.chdir('../')
    with open(transcript, "w") as file:
        for utt in utterance_text:
            file.write(utt[0][:6] + '/' + utt[0] + '|' + utt[1] + '|' + utt[1] + '|' +  utt[2] + '|' +  str(utt[3]) + '|' +  'None\n')


def line_replace(line):
    line = line.replace('(SP:)', '')
    line = line.replace('(SP:', '')
    line = line.replace('(SN:)', '')
    line = line.replace('(SN:', '')
    line = line.replace('(NO:)', '')
    line = line.replace('(NO:', '')
    line = line.replace('spn', '')
    line = line.replace('', '')
    line = line.replace('', '')
    line = line.replace('', '')
    line = line.replace('', '')
    line = line.replace('毛', '')
    line = line.replace(')', '')
    line = line.replace('(', '')
    line = line.replace('"', '')
    line = line.replace('.', '')
    line = line.replace('[', '')
    line = line.replace(',', '')
    line = line.replace('!', '')
    line = line.replace('?', '')
    line = line.replace(']', '')
    line = line.replace('.', '')
    line = line.replace('  ', ' ')
    return line

def aligner():
    filters = '([.,!?])"'
    file_list = []

    with open(transcript, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            temp = line.split('|')
            
            file_dir, script = temp[0], temp[3]
            script = re.sub(re.compile(filters), '', script)

            # filters로 걸러지지 않는 항목 추가 제거
            script = line_replace(script) 
            
            
            fn = file_dir[:-3] + 'lab'
            file_dir = os.path.join(data_dir, fn)
            with open(file_dir, 'w', encoding='utf-8') as f:
                f.write(script)

            file_list.append(os.path.join(file_dir))

    with open(transcript, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            temp = line.split('|')
            
            file_list.append(os.path.join(data_dir, temp[0][:-3]) + 'lab')


    jamo_dict = {}
    for file_name in tqdm(file_list):
        sentence =  open(file_name, 'r', encoding='utf-8').readline()
        jamo = h2j(sentence).split(' ')
        
        for i, s in enumerate(jamo):
            if s not in jamo_dict:
                jamo_dict[s] = ' '.join(jamo[i])

    with open(dict_name, 'w', encoding='utf-8') as f:
        for key in jamo_dict.keys():
            content = '{}\t{}\n'.format(key, jamo_dict[key])
            f.write(content)
    print("Aligner Done\n")


def mfa_train():
    print("MFA Training Start.. \n")

    os.system('mfa train_g2p ' + dict_name + ' ' + name + '_korean.zip --clear')
    print("MFA train_g2p Done\n")

    os.system('mfa g2p ' + name + '_korean.zip ' + data_dir + ' ' + name + '_korean.txt')
    print("MFA g2p Done\n")
   
    os.system('mfa train ' + data_dir + ' ' + name + '_korean.txt ./textgrids --clean')
    
    os.system('mv ~/Documents/MFA/wavs_train_acoustic_model/sat_2_ali/textgrids ./')
    os.system('zip -r textgrids.zip textgrids')
    os.system('mv textgrids.zip ' + first_dir) # 압축 후 최상위 디렉토리에 zip 파일로 생성
    print("MFA Training Done! \n")
    

def lab_separate():
    speaker_list = os.listdir('wavs')
    os.mkdir('lab')
    for speaker in speaker_list:
        os.mkdir('lab/' + speaker)
        lab_list = os.listdir(os.path.join('wavs', speaker))
        for lab in lab_list:
            if lab[-3:] == 'lab':
                os.system('mv ' 'wavs/' + speaker + '/' + lab + ' lab/' + speaker)


if __name__ == '__main__':
    os.chdir('dataset/' + hp.dataset)

    # 디렉토리와 파일에 숫자 제외 모두 제거, 중복되지 않도록 조정
    #change_name('wavs', 'wav')
    #change_name('label', 'json')

    # AIHub 데이터 기준 json 데이터를 transcript로 변환
    #json_to_transcripts()

    # 1) mfa를 위해 데이터마다 lab 파일 생성
    aligner()

    # 2) 순차적으로 mfa 수행하여 textgrids 생성 및 압축  
    mfa_train()

    # 3) 디렉토리에 lab 파일 제거
    lab_separate()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/kss'

In [39]:
def process_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    modified_lines = [line.split('|', 1)[1] for line in lines]

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(modified_lines)

# Specify your input and output file names
input_file_name = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB.txt'
output_file_name = '/home/soma1/문서/swm_team_filo/zolup/Korean-FastSpeech2-Pytorch-swl/Korean-FastSpeech2-Pytorch/AIHUB/KSB_out.txt'

# Process the file
process_file(input_file_name, output_file_name)
