In [None]:
# Mount Google Drive so the notebook can read the data stored there
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive/"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
import os
import json
import soundfile as sf
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [None]:
# 1. Load the Wav2Vec2 processor and base model
# Use the GPU when one is attached; otherwise fall back to the CPU instead of
# crashing on a hard-coded .to("cuda").
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
embedding_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(embedding_device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

In [None]:
# 2. Load the emotion analysis model
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
import torch.nn as nn

class RegressionHead(nn.Module):
    """Two-layer head mapping a feature vector to ``config.num_labels`` outputs.

    The attribute names ``dense``, ``dropout`` and ``out_proj`` are part of the
    state-dict layout and must not be renamed.
    """

    def __init__(self, config):
        """Build the layers from ``config.hidden_size``, ``config.final_dropout``
        and ``config.num_labels``."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Apply dropout -> dense -> tanh -> dropout -> output projection.

        Extra keyword arguments are accepted and ignored.
        """
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))

class EmotionModel(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 backbone followed by a ``RegressionHead``.

    The attribute names ``wav2vec2`` and ``classifier`` determine how pretrained
    weights are matched and must not be renamed.
    """

    def __init__(self, config):
        """Create the backbone and the head from ``config`` and initialise weights."""
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(self, input_values):
        """Run the backbone on ``input_values`` and score the pooled output.

        The first element of the backbone output is averaged over dimension 1
        (presumably the frame/time axis -- confirm against the backbone's output
        layout) and passed through the head.

        Returns:
            A ``(pooled_hidden_states, logits)`` tuple.
        """
        backbone_output = self.wav2vec2(input_values)
        pooled = backbone_output[0].mean(dim=1)
        return pooled, self.classifier(pooled)

# Use the GPU when one is attached; otherwise fall back to the CPU instead of
# crashing on a hard-coded .to("cuda").
emotion_model = EmotionModel.from_pretrained(
    "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
).to("cuda" if torch.cuda.is_available() else "cpu")


config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/661M [00:00<?, ?B/s]

In [None]:
# 3. Audio processing functions
def load_audio(file_path):
    """Read an audio file with ``soundfile`` and return it as a mono signal.

    Multi-dimensional data is reduced to one channel by averaging over axis 1.

    Raises:
        ValueError: if the file's sample rate is not 16000 Hz.
    """
    samples, sample_rate = sf.read(file_path)
    # Stereo (or any multi-channel) input -> mono
    mono = samples.mean(axis=1) if samples.ndim > 1 else samples
    if sample_rate != 16000:
        raise ValueError("Sampling rate must be 16000 Hz")
    return mono

def extract_embeddings(audio_path, processor, model):
    """Return ``model``'s last hidden state for a single audio file.

    Args:
        audio_path: Path handed to ``load_audio`` (which requires 16000 Hz audio).
        processor: Callable turning the waveform into an object exposing
            ``input_values`` (called with ``sampling_rate=16000``,
            ``return_tensors="pt"`` and ``padding="longest"``).
        model: Module whose output exposes ``last_hidden_state``.

    Returns:
        The ``last_hidden_state`` tensor, computed without gradient tracking.
    """
    speech = load_audio(audio_path)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding="longest")
    input_values = inputs.input_values
    # Follow the model's own device instead of assuming CUDA, so the function
    # works wherever the model was placed (GPU or CPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_values = input_values.to(first_param.device)
    with torch.no_grad():
        outputs = model(input_values)
    return outputs.last_hidden_state

def predict_emotion(embeddings, model):
    """Average ``embeddings`` over dimension 1 and return ``model``'s logits.

    ``model`` is called with the pooled tensor as its only argument and must
    return a 2-tuple whose second element is the logits tensor.

    NOTE(review): the pooled tensor is passed to ``model`` as-is; confirm that
    ``model`` really expects pooled features here rather than a raw waveform.

    Returns:
        The logits as a NumPy array on the CPU.
    """
    with torch.no_grad():
        pooled = embeddings.mean(dim=1)
        _, scores = model(pooled)
    return scores.cpu().numpy()

In [None]:
# 4. Process the files in a folder and save the results as JSON
def _predict_emotion_from_waveform(file_path):
    """Score one file by feeding its processed waveform straight into ``emotion_model``.

    ``EmotionModel.forward`` hands its argument to its own Wav2Vec2 backbone as
    ``input_values``, so it must receive the processor output for the raw
    samples, not hidden states produced by a different model.
    """
    speech = load_audio(file_path)
    inputs = embedding_processor(speech, sampling_rate=16000, return_tensors="pt")
    # Run on whatever device the emotion model was loaded onto.
    device = next(emotion_model.parameters()).device
    with torch.no_grad():
        _, logits = emotion_model(inputs.input_values.to(device))
    return logits.cpu().numpy()


def process_audio_files(base_dir, output_json):
    """Score every ``.wav`` file under ``base_dir`` and dump the results as JSON.

    The directory tree is walked recursively. For each file the result is either
    ``{"file_path", "emotions": {"arousal", "dominance", "valence"}}`` or, when
    anything fails for that file, ``{"file_path", "error"}`` so that one bad file
    does not abort the whole run.

    Results are keyed by file name. If the same file name appears again in
    another sub-folder, the later entry is keyed by its path relative to
    ``base_dir`` instead of silently overwriting the earlier one.

    Args:
        base_dir: Root folder to search.
        output_json: Path of the JSON file to write (UTF-8, indented).
    """
    results = {}
    for root, _, files in os.walk(base_dir):
        for file in files:
            # Accept ".wav" regardless of case (e.g. ".WAV").
            if not file.lower().endswith(".wav"):
                continue
            file_path = os.path.join(root, file)
            # Keep the plain file name as the key unless it is already taken.
            key = file if file not in results else os.path.relpath(file_path, base_dir)
            try:
                # Emotion analysis on the waveform itself
                emotion_logits = _predict_emotion_from_waveform(file_path)

                # Store the result
                results[key] = {
                    "file_path": file_path,
                    "emotions": {
                        "arousal": float(emotion_logits[0][0]),
                        "dominance": float(emotion_logits[0][1]),
                        "valence": float(emotion_logits[0][2]),
                    }
                }
            except Exception as e:
                # Best effort: record the failure and continue with the next file.
                results[key] = {
                    "file_path": file_path,
                    "error": str(e)
                }

    # Save as JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Results saved to {output_json}")

In [None]:
# 5. Main function
def main(base_dir="/content/drive/MyDrive/personal_study/", output_json="emotion_results.json"):
    """Run the emotion analysis over a folder and write the JSON report.

    Args:
        base_dir: Folder searched for audio files; defaults to the Drive
            folder this notebook was written for.
        output_json: Destination of the JSON report.
    """
    process_audio_files(base_dir, output_json)

if __name__ == "__main__":
    main()

Results saved to emotion_results.json
