In [1]:
pip install hgtk

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import hgtk
import re

In [3]:
# 기본 경로 설정 (폴더의 최상위 경로로 변경하세요)
base_dir = '/mnt/c'

# 데이터 저장을 위한 리스트 초기화
data = []

# os.walk를 사용하여 모든 하위 디렉토리 탐색
for root, dirs, files in os.walk(base_dir):
    # 각 파일에 대해 처리
    for file in files:
        # 파일 확장자에 따라 처리 분기
        if file.endswith('.txt'):
            txt_path = os.path.join(root, file)
            wav_path = os.path.join(root, file.replace('.txt', '.wav'))

            # 대응되는 .wav 파일이 있는지 확인
            if os.path.exists(wav_path):
                # 텍스트 파일 읽기
                with open(txt_path, 'r', encoding='utf-8') as f:
                    text = f.read().strip()

                # 한글 자음과 모음으로 분리
                decomposed_text = hgtk.text.decompose(text, compose_code='')

                # 데이터 추가
                data.append({
                    'wav_path': wav_path,
                    'decomposed_text': decomposed_text
                })

# 데이터프레임 생성
df = pd.DataFrame(data)

# 결과 확인
print(df.head())

                                            wav_path  \
0  /mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...   
1  /mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...   
2  /mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...   
3  /mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...   
4  /mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...   

                                     decomposed_text  
0  o/ n/ ㄴㅔ ㄱㅏㅁㅅㅏㅎㅏㅂㄴㅣㄷㅏ. (NCS)/(ㅇㅔㄴㅆㅣㅇㅔㅅㅡ) ㄱㅛㅇㅠㄱ...  
1                                 o/ n/ ㄴㅔ ㅇㅕㅂㅗㅅㅔㅇㅛ.  
2  o/ n/ ㅇㅏ ㄴㅔ ㅈㅓㄱㅣ ㄱㅡ (NCS)/(ㅇㅔㄴㅆㅣㅇㅔㅅㅡ) ㅇㅣㄴㅅㅏㄷㅏㅁ...  
3  o/ n/ ㅇㅏ ㄴㅔ ㅁㅏㅈㅅㅡㅂㄴㅣㄷㅏ. ㅎㅗㅁㅍㅔㅇㅣㅈㅣㅇㅔ ㄴㅏㅇㅘㅇㅣㅆㄴㅡㄴ...  
4                            o/ n/ ㄴㅔ ㅇㅏㄹㄱㅔㅆㅅㅡㅂㄴㅣㄷㅏ.  


In [None]:
df.to_csv('/mnt/c/datasets.csv', index=False, encoding='utf-8-sig')

In [2]:
df = pd.read_csv('/mnt/c/datasets.csv')

In [5]:
def remove_non_korean(text):
    cleaned_text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

In [8]:
df.head()

Unnamed: 0,wav_path,decomposed_text
0,/mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...,ㄴㅔ ㄱㅏㅁㅅㅏㅎㅏㅂㄴㅣㄷㅏ ㅇㅔㄴㅆㅣㅇㅔㅅㅡ ㄱㅛㅇㅠㄱㄱㅘㅈㅓㅇ ㅁㅜㄴㅇㅢ ㅊㅔㅎ...
1,/mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...,ㄴㅔ ㅇㅕㅂㅗㅅㅔㅇㅛ
2,/mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...,ㅇㅏ ㄴㅔ ㅈㅓㄱㅣ ㄱㅡ ㅇㅔㄴㅆㅣㅇㅔㅅㅡ ㅇㅣㄴㅅㅏㄷㅏㅁㄷㅏㅇㅈㅏ ㄱㅣㅂㅗㄴ ㅅㅣ...
3,/mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...,ㅇㅏ ㄴㅔ ㅁㅏㅈㅅㅡㅂㄴㅣㄷㅏ ㅎㅗㅁㅍㅔㅇㅣㅈㅣㅇㅔ ㄴㅏㅇㅘㅇㅣㅆㄴㅡㄴ ㅇㅓ ㄴㅔ ...
4,/mnt/c/Users/tkd39/stt/1.Training/D03/J13/S000...,ㄴㅔ ㅇㅏㄹㄱㅔㅆㅅㅡㅂㄴㅣㄷㅏ


In [5]:
print(len(df))

50000


In [4]:
df = df.head(50000)

In [6]:
df['decomposed_text'] = df['decomposed_text'].apply(remove_non_korean)

In [7]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [8]:
from transformers import Wav2Vec2CTCTokenizer

# 분해된 텍스트에서 고유한 문자 추출
all_chars = set(''.join(train_df['decomposed_text']))
vocab_dict = {char: idx for idx, char in enumerate(all_chars)}

# 스페셜 토큰 추가
vocab_dict["[PAD]"] = len(vocab_dict)
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["|"] = len(vocab_dict)  # 공백 문자 대체

# vocab.json 저장
import json

with open('vocab.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_dict, f, ensure_ascii=False)


In [9]:
from transformers import Wav2Vec2FeatureExtractor

# feature_extractor 정의
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)


In [11]:
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")


In [12]:
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
import logging

# 로그 설정
logging.basicConfig(filename='error.log', level=logging.ERROR)

def speech_file_to_array_fn(path):
    try:
        speech_array, sampling_rate = torchaudio.load(path)
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)
        speech = speech_array.squeeze().numpy()
        return speech
    except Exception as e:
        logging.error(f"Error processing {path}: {e}")
        return None

def apply_with_progress(df, func):
    speech_list = []
    for path in tqdm(df['wav_path'], desc='Processing audio files', leave=False):
        speech = func(path)
        speech_list.append(speech)
    df['speech'] = speech_list
    return df

# 데이터 적용
train_df = apply_with_progress(train_df, speech_file_to_array_fn)
test_df = apply_with_progress(test_df, speech_file_to_array_fn)

# None 값 제거
train_df = train_df[train_df['speech'].notnull()]
test_df = test_df[test_df['speech'].notnull()]



                                                                              

In [16]:
from datasets import Dataset, DatasetDict, Audio

# 필요한 컬럼만 포함된 데이터프레임 생성
train_df_small = train_df[['wav_path', 'decomposed_text']]
test_df_small = test_df[['wav_path', 'decomposed_text']]

# 데이터셋 생성
train_dataset = Dataset.from_pandas(train_df_small, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df_small, preserve_index=False)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# 오디오 컬럼을 Audio 타입으로 캐스팅
dataset = dataset.cast_column("wav_path", Audio(sampling_rate=16000))

# 전처리 함수 정의
def prepare_dataset(batch):
    # 오디오 데이터 로드 및 처리
    speech_list = []
    for audio in batch["wav_path"]:
        speech_list.append(audio["array"])
    batch["input_values"] = feature_extractor(speech_list, sampling_rate=16000).input_values
    # 레이블 인코딩
    batch["labels"] = [tokenizer(text).input_ids for text in batch["decomposed_text"]]
    return batch

# 데이터셋 전처리
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset["train"].column_names,
    batched=True,
    batch_size=100,
)

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

2024-10-26 18:56:10.213242: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
# 전처리된 데이터셋을 저장
dataset.save_to_disk("/mnt/processed_dataset")

Saving the dataset (0/49 shards):   0%|          | 0/45000 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
from datasets import load_from_disk

# 저장된 데이터셋 불러오기
dataset = load_from_disk("/mnt/processed_dataset")

Loading dataset from disk:   0%|          | 0/49 [00:00<?, ?it/s]

In [10]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor

# 토크나이저 로드
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|"
)

# 특징 추출기 로드
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)


In [11]:
from transformers import Wav2Vec2ForCTC

# 모델 로드
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer)
)

# 모델의 토크나이저 및 특징 추출기 설정
model.config.gradient_checkpointing = True
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = len(tokenizer)


2024-10-26 19:10:58.675715: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from dataclasses import dataclass
from typing import Dict, List, Union
import torch

@dataclass
class DataCollatorCTCWithPadding:
    feature_extractor: Wav2Vec2FeatureExtractor
    tokenizer: Wav2Vec2CTCTokenizer
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[torch.Tensor, List[float]]]]) -> Dict[str, torch.Tensor]:
        # 입력 값과 레이블을 분리
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # 입력 값 패딩
        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # 레이블 패딩
        labels_batch = self.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # 레이블에서 패딩된 토큰을 -100으로 설정 (CTC Loss에서 무시)
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    padding=True
)


In [13]:
import evaluate
import numpy as np

# 평가 지표 로드
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # 예측 결과 디코딩
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # 레이블 디코딩
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # WER 및 CER 계산
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}



In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/mnt/d/datasets/wav2vec2-finetuned-ko",
    group_by_length=True,
    per_device_train_batch_size=8,  # GPU 메모리에 따라 조절하세요
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=5,
    fp16=True,  # GPU가 FP16을 지원하지 않으면 False로 설정
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False
)




In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=feature_extractor
)

# 학습 시작
trainer.train()


  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Wer,Cer
500,6.3367,3.202706,1.0,0.997713
1000,6.3166,3.177886,1.0,0.997715
1500,6.2586,3.175218,1.0,0.99417
2000,6.2608,3.10869,1.0,0.990072
2500,6.158,3.084275,0.999987,0.98613
3000,6.0992,3.041531,1.0,0.981696
3500,4.8059,1.950138,0.99546,0.541087
4000,2.6501,1.007917,0.79078,0.252954
4500,2.1708,0.794418,0.672461,0.198241
5000,1.9327,0.683179,0.602976,0.17088


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=14060, training_loss=2.808842104427506, metrics={'train_runtime': 24962.5605, 'train_samples_per_second': 9.013, 'train_steps_per_second': 0.563, 'total_flos': 5.837557399850187e+19, 'train_loss': 2.808842104427506, 'epoch': 4.999111111111111})

In [19]:
# 모델 평가
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Evaluation Results: {'eval_loss': 0.3861437141895294, 'eval_wer': 0.371920900362434, 'eval_cer': 0.09783614914198224, 'eval_runtime': 258.1407, 'eval_samples_per_second': 19.369, 'eval_steps_per_second': 2.421, 'epoch': 4.999111111111111}


In [20]:
# 모델 저장
trainer.save_model("/mnt/wav2vec2-finetuned-ko")
tokenizer.save_pretrained("/mnt/wav2vec2-finetuned-ko")
feature_extractor.save_pretrained("/mnt/wav2vec2-finetuned-ko")

['/mnt/d/datasets/wav2vec2-finetuned-ko/preprocessor_config.json']