In [1]:
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import torchaudio
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import os
import soundfile as sf
import pickle
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. HuBERT 특성 추출기 정의
class HuBERTFeatureExtractor:
    def __init__(self, model_name="facebook/hubert-base-ls960"):
        # HuBERT 모델과 프로세서 초기화
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = HubertModel.from_pretrained(model_name)
        self.model.eval()  # 평가 모드로 설정

    def load_audio(self, audio_file):
        # 오디오 파일 로드
        waveform, sample_rate = torchaudio.load(audio_file)
        
        return waveform, sample_rate

    def preprocess_audio(self, waveform, sample_rate, target_sample_rate=16000, max_length=10):
        # 모노로 변환
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        
        # 샘플링 레이트 변환
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)
            
        max_samples = target_sample_rate * max_length
        if waveform.size(1) > max_samples:
            waveform = waveform[:, :max_samples]
            
        return waveform

    def extract_features(self, audio_file):
        # 오디오 로드 및 전처리
        waveform, sample_rate = self.load_audio(audio_file)
        waveform = self.preprocess_audio(waveform, sample_rate)
        
        # 입력 형태 확인 및 조정
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        elif waveform.dim() == 2:
            if waveform.size(0) > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
        elif waveform.dim() == 3:
            waveform = waveform.squeeze(0)
            if waveform.size(0) > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
        
        # 특성 추출
        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = inputs.input_values
        
        # 불필요한 차원 제거
        input_values = input_values.squeeze(1)  # (batch_size=1, sequence_length)
        
        with torch.no_grad():
            outputs = self.model(input_values)
            
        features = outputs.last_hidden_state
        
        return features

In [3]:
# 2. 데이터셋 클래스 정의
class EmotionDataset(Dataset):
    def __init__(self, audio_files, labels, feature_extractor, label_encoder_path="./label_encoder.pkl"):
        self.audio_files = audio_files
        self.labels = labels
        self.feature_extractor = feature_extractor
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        
        # LabelEncoder 초기화
        if label_encoder_path and os.path.exists(label_encoder_path):
            with open(label_encoder_path, 'rb') as f:
                self.label_encoder = pickle.load(f)
            print(f"LabelEncoder loaded from {label_encoder_path}")
        else:
            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(labels)
            if label_encoder_path:
                with open(label_encoder_path, 'wb') as f:
                    pickle.dump(self.label_encoder, f)
                print(f"LabelEncoder saved to {label_encoder_path}")

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        audio_file = self.audio_files[idx]
        label = self.labels[idx]
        
        # HuBERT로 음성 특성 추출
        features = self.feature_extractor.extract_features(audio_file)
        features = features.squeeze(0)
        
        # 라벨 인코딩
        label = self.label_encoder.transform([label])[0]
        
        return features, label

In [4]:
# 3. Transformer 모델 정의 (감정 인식용)
class EmotionTransformer(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(EmotionTransformer, self).__init__()
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=16),
            num_layers=8
        )
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x, src_key_padding_mask=None):
        # Transformer를 통해 특성 추출
        x = self.transformer(x, src_key_padding_mask=src_key_padding_mask)
        x = x[-1]  # Sequence의 평균을 사용
        output = self.fc(x)
        return output

In [5]:
# 4. 모델 학습 함수
def train_model(train_dataloader, model, criterion, optimizer, num_epochs=10, save_interval=1, batch_interval=10):
    device = torch.device("mps")
    model = model.to(device)
    
    # 저장 디렉토리 생성
    # os.makedirs(save_dir, exist_ok=True)

    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs} started.")
        
        for batch_idx, (features, labels) in enumerate(train_dataloader, start=1):
            features, labels = features.to(device), labels.to(device)
            
            # 모델에 입력
            features = features.permute(1, 0, 2)  # (seq_len, batch, features), Transformer의 입력 차원에 맞게 차원 변환
            labels = labels.long()
            
            optimizer.zero_grad() # 경사도 초기화
            
            # 예측
            outputs = model(features)
            
            # 손실 계산
            loss = criterion(outputs, labels)
            
            # 역전파
            loss.backward()
            optimizer.step()
            
            # 현재 배치 상태 출력
            if batch_idx % batch_interval == 0:  # 10번째 배치마다 출력
                print(f"Epoch {epoch + 1}, Batch {batch_idx}/{len(train_dataloader)}, Loss: {loss.item():.4f}")
                
            # 배치마다 모델 덮어쓰기 저장
            if batch_idx % save_interval == 0:
                # save_path = os.path.join(save_dir, f"model_epoch_{epoch + 1}_batch_{batch_idx}.pth")
                torch.save(model, "./model.pth")
                print(f"Model saved")

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

In [6]:
# 폴더 안의 폴더 이름 리스트
def get_folder_names(directory):
    try:
        folder_names = sorted(name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name)))
        return folder_names
    except FileNotFoundError:
        print(f"디렉터리 '{directory}'를 찾을 수 없습니다.")
        return []
    except PermissionError:
        print(f"디렉터리 '{directory}'에 접근할 수 없습니다.")
        return []

# 폴더 안의 파일 이름 리스트
def get_file_names(directory):
    try:
        file_names = [name for name in os.listdir(directory) if os.path.isfile(os.path.join(directory, name))]
        return file_names
    except FileNotFoundError:
        print(f"디렉터리 '{directory}'를 찾을 수 없습니다.")
        return []
    except PermissionError:
        print(f"디렉터리 '{directory}'에 접근할 수 없습니다.")
        return []

In [7]:
def collate_fn(batch):
    features = [item[0] for item in batch]  # 각 샘플의 feature 추출
    labels = torch.tensor([item[1] for item in batch])  # 각 샘플의 label 추출
    
    # 시퀀스 패딩 적용 (seq_len을 동일하게 맞춤)
    features = pad_sequence(features, batch_first=True)  # (batch_size, max_seq_len, feature_dim)
    
    return features, labels

def create_padding_mask(features):
    return (features.sum(dim=-1) == 0)

In [8]:
# 5. 학습 데이터 준비
dataset_dir = "./dataset/015.감성 및 발화 스타일별 음성합성 데이터/01.데이터/1.Training/원천데이터/" # 오디오 파일이 저장된 폴더 경로

audio_files = []
labels = []
emotions = ['기쁨', '슬픔', '분노', '불안', '상처', '당황', '중립']

for i in range(4):
    audio_path = dataset_dir + 'TS1/1.감정/' + str(i + 1) + '.' + emotions[i]
    
    dir_list = get_folder_names(audio_path)
    
    for j in dir_list:
        audio_dir = audio_path + '/' + j
        file_list = get_file_names(audio_dir)
        
        for file in file_list:
            audio_files.append(audio_dir + '/' + file)
            labels.append(emotions[i])
for i in range(3):
    audio_path = dataset_dir + 'TS2/1.감정/' + str(i + 5) + '.' + emotions[i + 4]
    
    dir_list = get_folder_names(audio_path)
    
    for j in dir_list:
        audio_dir = audio_path + '/' + j
        file_list = get_file_names(audio_dir)
        
        for file in file_list:
            audio_files.append(audio_dir + '/' + file)
            labels.append(emotions[i + 4])

In [9]:
train_audio_files = random.sample(audio_files, 453965)
train_labels = random.sample(labels, 453965)

In [10]:
print(len(train_audio_files), len(train_labels), len(audio_files), len(labels))

453965 453965 453965 453965


In [11]:
print(train_labels[:100])

['중립', '당황', '불안', '상처', '불안', '슬픔', '중립', '상처', '기쁨', '분노', '중립', '슬픔', '당황', '중립', '중립', '슬픔', '기쁨', '분노', '상처', '기쁨', '중립', '불안', '분노', '슬픔', '당황', '상처', '분노', '상처', '당황', '당황', '불안', '분노', '중립', '슬픔', '당황', '기쁨', '상처', '기쁨', '슬픔', '당황', '분노', '당황', '불안', '당황', '분노', '당황', '중립', '상처', '당황', '분노', '중립', '당황', '불안', '불안', '상처', '당황', '당황', '중립', '상처', '당황', '상처', '상처', '당황', '분노', '기쁨', '분노', '불안', '상처', '중립', '당황', '당황', '불안', '기쁨', '기쁨', '상처', '불안', '슬픔', '기쁨', '분노', '중립', '슬픔', '상처', '불안', '당황', '중립', '상처', '상처', '중립', '기쁨', '상처', '슬픔', '슬픔', '불안', '분노', '상처', '중립', '불안', '상처', '당황', '불안']


In [12]:
# 6. 모델 학습 준비
feature_extractor = HuBERTFeatureExtractor()
dataset = EmotionDataset(train_audio_files, train_labels, feature_extractor)
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# 모델, 손실 함수, 최적화 함수 정의
model = EmotionTransformer(input_dim=768, num_classes=len(set(train_labels)))  # HuBERT의 출력 차원은 768
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


LabelEncoder loaded from ./label_encoder.pkl




In [13]:
train_model(train_dataloader, model, criterion, optimizer, num_epochs=1, save_interval=1, batch_interval=1)

Epoch 1/1 started.
Epoch 1, Batch 1/14187, Loss: 2.0949
Model saved
Epoch 1, Batch 2/14187, Loss: 3.2894
Model saved
Epoch 1, Batch 3/14187, Loss: 3.6010
Model saved
Epoch 1, Batch 4/14187, Loss: 2.4786
Model saved
Epoch 1, Batch 5/14187, Loss: 2.0279
Model saved
Epoch 1, Batch 6/14187, Loss: 2.2764
Model saved
Epoch 1, Batch 7/14187, Loss: 2.1141
Model saved
Epoch 1, Batch 8/14187, Loss: 2.3278
Model saved
Epoch 1, Batch 9/14187, Loss: 2.1489
Model saved
Epoch 1, Batch 10/14187, Loss: 2.1413
Model saved
Epoch 1, Batch 11/14187, Loss: 2.0939
Model saved
Epoch 1, Batch 12/14187, Loss: 1.9904
Model saved
Epoch 1, Batch 13/14187, Loss: 1.9003
Model saved
Epoch 1, Batch 14/14187, Loss: 2.1418
Model saved
Epoch 1, Batch 15/14187, Loss: 2.1710
Model saved
Epoch 1, Batch 16/14187, Loss: 2.0987
Model saved
Epoch 1, Batch 17/14187, Loss: 2.0539
Model saved
Epoch 1, Batch 18/14187, Loss: 1.9690
Model saved
Epoch 1, Batch 19/14187, Loss: 1.9892
Model saved
Epoch 1, Batch 20/14187, Loss: 2.0274
Mo