In [1]:
# Mount Google Drive at /content/drive/ so the notebook can read the data stored there
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [3]:
import os
import json
import torch
import soundfile as sf
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [4]:
# Per-language model info: language name -> pretrained model identifier
# (the identifier is what gets handed to `from_pretrained`).
models_info = dict(
    portuguese="jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
    russian="jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    chinese="jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
    dutch="jonatasgrosman/wav2vec2-large-xlsr-53-dutch",
    japanese="jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
)

In [5]:
# Load the emotion analysis model
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
import torch.nn as nn

class RegressionHead(nn.Module):
    """Small feed-forward head turning a hidden vector into ``num_labels`` outputs.

    The forward pass is: dropout -> linear (hidden_size -> hidden_size) ->
    tanh -> dropout -> linear (hidden_size -> num_labels).

    Args:
        config: Object exposing ``hidden_size``, ``final_dropout`` and
            ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Attribute names and creation order are kept as-is: the names are the
        # state-dict keys, and the order fixes the random-init sequence.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` to the head's outputs.

        Args:
            features: Tensor whose last dimension is ``hidden_size``.
            **kwargs: Accepted and ignored.

        Returns:
            Tensor with last dimension ``num_labels``.
        """
        hidden = self.dense(self.dropout(features))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)

class EmotionModel(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder with a ``RegressionHead`` on top of mean-pooled states.

    Args:
        config: Model configuration; it is passed to the base class, to the
            ``Wav2Vec2Model`` encoder and to the ``RegressionHead``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Sub-module names must stay `wav2vec2` / `classifier`: they are the
        # prefixes of the parameter names in the model's state dict.
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(self, input_values):
        """Encode ``input_values`` and run the regression head.

        Args:
            input_values: Input handed unchanged to the ``Wav2Vec2Model``
                encoder.

        Returns:
            Tuple ``(pooled_hidden_states, logits)``: the encoder's first
            output averaged over dimension 1, and the head's output for that
            pooled tensor.
        """
        encoder_out = self.wav2vec2(input_values)
        # Average over dim 1 (presumably the frame/time axis - TODO confirm).
        pooled = encoder_out[0].mean(dim=1)
        return pooled, self.classifier(pooled)

# Module-level emotion model: built from the
# "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim" checkpoint and moved to
# the CUDA device. `process_audio_files` reads this global directly.
emotion_model = EmotionModel.from_pretrained("audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim").to("cuda")


In [6]:
# Load an audio file
def load_audio(file_path):
    """Read an audio file and return its samples as a mono signal.

    Args:
        file_path: Path handed to ``soundfile.read``.

    Returns:
        The samples returned by ``soundfile.read``; data with more than one
        dimension is averaged over axis 1 (stereo -> mono).

    Raises:
        ValueError: If the file's sampling rate is not 16000 Hz.
    """
    waveform, sample_rate = sf.read(file_path)
    if waveform.ndim > 1:
        # Multi-channel input: collapse the channels by averaging (stereo -> mono).
        waveform = waveform.mean(axis=1)
    if sample_rate == 16000:
        return waveform
    raise ValueError("Sampling rate must be 16000 Hz")


In [7]:
# Extract embeddings
def extract_embeddings(audio_path, processor, model):
    """Run one audio file through ``model`` and return its last hidden states.

    The input tensor is placed on whichever device ``model``'s parameters live
    on, instead of a hard-coded ``"cuda"``; a hard-coded device breaks as soon
    as the model is on the CPU or on a different GPU.

    Args:
        audio_path: Path of the audio file, loaded with ``load_audio``.
        processor: Callable invoked as
            ``processor(speech, sampling_rate=16000, return_tensors="pt",
            padding="longest")``; its result must expose ``input_values``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.

    Returns:
        ``outputs.last_hidden_state`` computed under ``torch.no_grad()``, on
        the model's device.

    Raises:
        ValueError: Propagated from ``load_audio`` when the file is not
            sampled at 16000 Hz.
    """
    speech = load_audio(audio_path)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding="longest")
    # Follow the model's own device so inputs and weights always agree.
    device = next(model.parameters()).device
    input_values = inputs.input_values.to(device)
    with torch.no_grad():
        outputs = model(input_values)
        hidden_states = outputs.last_hidden_state
    return hidden_states

In [8]:
# Emotion analysis
def predict_emotion(embeddings, model):
    """Score already-extracted embeddings with the emotion model's head.

    The embeddings are averaged over dimension 1 and the resulting vector is
    passed to ``model.classifier``. It must NOT go through ``model(...)``:
    ``EmotionModel.forward`` feeds its input to the wav2vec2 encoder first, so
    a pooled embedding would be interpreted as a raw waveform and the scores
    would be meaningless.

    NOTE(review): the head is applied to embeddings produced by whatever
    encoder the caller used; whether a head trained with one encoder gives
    meaningful scores on another encoder's embeddings should be validated.

    Args:
        embeddings: Tensor that is averaged over dimension 1; the pooled
            result must match the head's input width.
        model: Object exposing a ``classifier`` head. For backward
            compatibility, an object without that attribute is called
            directly and must return a ``(hidden_states, logits)`` pair.

    Returns:
        The logits as a NumPy array on the CPU.
    """
    pooled_embeddings = torch.mean(embeddings, dim=1)
    head = getattr(model, "classifier", None)
    with torch.no_grad():
        if head is not None:
            # Apply only the head; the encoder has already been run upstream.
            logits = head(pooled_embeddings)
        else:
            _, logits = model(pooled_embeddings)
    return logits.cpu().numpy()

In [9]:
# Process every file in a folder tree and save the results as JSON
def process_audio_files(base_dir, model_name, processor, embedding_model, output_json):
    """Score every ``.wav`` file under ``base_dir`` and write the results as JSON.

    For each file, embeddings are extracted with ``extract_embeddings`` and
    scored with ``predict_emotion`` using the module-level ``emotion_model``.
    A failure on one file is recorded as an ``"error"`` entry and does not
    stop the run.

    Result keys are the bare file name, as before. Because the walk is
    recursive, two files with the same name in different sub-folders used to
    overwrite each other; a name that is already taken is now stored under its
    path relative to ``base_dir`` instead. Directories and files are visited
    in sorted order so which file keeps the bare name is deterministic.

    Args:
        base_dir: Root folder that is walked recursively.
        model_name: Currently unused; kept for interface compatibility.
        processor: Passed through to ``extract_embeddings``.
        embedding_model: Passed through to ``extract_embeddings``.
        output_json: Path of the JSON file to write (UTF-8, indent 4).

    Returns:
        None. The results are written to ``output_json`` and a confirmation
        line is printed.
    """
    results = {}
    for root, dirs, files in os.walk(base_dir):
        # Sorting `dirs` in place steers os.walk's (top-down) traversal order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue
            file_path = os.path.join(root, file)
            # Keep the historical bare-name key unless it would clobber an
            # entry from another folder.
            key = file if file not in results else os.path.relpath(file_path, base_dir)
            try:
                # Extract embeddings
                embeddings = extract_embeddings(file_path, processor, embedding_model)

                # Emotion analysis
                emotion_logits = predict_emotion(embeddings, emotion_model)

                # Store the result
                results[key] = {
                    "file_path": file_path,
                    "emotions": {
                        "arousal": float(emotion_logits[0][0]),
                        "dominance": float(emotion_logits[0][1]),
                        "valence": float(emotion_logits[0][2]),
                    }
                }
            except Exception as e:
                # Deliberate best-effort: record the failure and move on.
                results[key] = {
                    "file_path": file_path,
                    "error": str(e)
                }

    # Save as JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Results saved to {output_json}")


In [10]:
# Main function
def main():
    """Run the emotion pipeline once per language listed in ``models_info``.

    For each language the matching processor and ``Wav2Vec2Model`` are loaded
    (the model is moved to CUDA), every audio file under the fixed Drive
    folder is processed, and the scores are written to
    ``emotion_results_<language>.json``.
    """
    base_dir = "/content/drive/MyDrive/personal_study/"

    for lang in models_info:
        model_id = models_info[lang]
        print(f"Processing {lang} model...")

        processor = Wav2Vec2Processor.from_pretrained(model_id)
        embedding_model = Wav2Vec2Model.from_pretrained(model_id)
        embedding_model = embedding_model.to("cuda")

        process_audio_files(
            base_dir,
            model_id,
            processor,
            embedding_model,
            f"emotion_results_{lang}.json",
        )


# Only run the pipeline when this code is executed as the main program.
if __name__ == "__main__":
    main()

Processing portuguese model...
Results saved to emotion_results_portuguese.json
Processing russian model...


preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Results saved to emotion_results_russian.json
Processing chinese model...


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/44.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

Results saved to emotion_results_chinese.json
Processing dutch model...


preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Results saved to emotion_results_dutch.json
Processing japanese model...


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/29.3k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Results saved to emotion_results_japanese.json
