In [8]:
import torch
import librosa
import numpy as np
import sys
import torch.nn as nn

print("imports complete")

imports complete


In [None]:
class EmotionDetector(nn.Module):
    """Convolutional classifier that maps a spectrogram to emotion-class logits.

    The network is a stack of ``layers`` blocks, each
    ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2)``, followed by an adaptive
    average pool that fixes the last axis to 256 columns, a flatten, and one
    linear layer producing ``num_classes`` logits.

    Args:
        layers: Number of convolutional blocks.
        channels_in: Channel count of the input tensor.
        channels_out: Output channels of the first block. It is also the
            multiplier applied between blocks, so block ``i`` outputs
            ``channels_out ** (i + 1)`` channels.
        kernel_size: Kernel size passed to every ``Conv2d`` (``padding="same"``).
        num_classes: Number of output logits.
        n_fft: FFT size the input spectrograms were computed with. The input is
            assumed to have ``n_fft // 2 + 1`` rows on axis 2 (what
            ``librosa.stft`` yields); this sizes the final linear layer.

    Raises:
        ValueError: If ``layers`` is so large that the pooled frequency axis
            would be empty for the given ``n_fft``.
    """

    def __init__(self, layers, channels_in, channels_out, kernel_size, num_classes=6, n_fft=2048):
        super().__init__()

        # Hyperparameters are stored exactly as passed in; the loop below uses
        # locals so these attributes are not clobbered during construction.
        self.layers = layers
        self.channels_out = channels_out
        self.channels_in = channels_in
        self.kernel_size = kernel_size
        self.channels_mult = channels_out

        self.convs = nn.ModuleList()
        block_in, block_out = channels_in, channels_out
        for _ in range(layers):
            self.convs.append(
                nn.Sequential(
                    nn.Conv2d(block_in, block_out, kernel_size, padding="same"),
                    nn.BatchNorm2d(block_out),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(2),
                )
            )
            # Channel count grows geometrically by `channels_mult` per block.
            block_in, block_out = block_out, self.channels_mult * block_out

        # Output size (None, 256): axis 2 is left as-is, the last axis is
        # averaged to 256 columns. NOTE(review): for (batch, ch, freq, time)
        # spectrograms the last axis is time, despite the attribute name; the
        # name is kept so existing checkpoints keep loading.
        self.freq_pool = nn.AdaptiveAvgPool2d((None, 256))

        # Each MaxPool2d(2) floors the row count in half, so after `layers`
        # blocks the n_fft // 2 + 1 input rows shrink to this many.
        pooled_bins = (n_fft // 2 + 1) // (2 ** layers)
        if pooled_bins < 1:
            raise ValueError(
                f"layers={layers} pools away all {n_fft // 2 + 1} frequency bins of n_fft={n_fft}"
            )
        # `block_in` now holds the channel count leaving the last block.
        # With channels_out=2 and n_fft=2048 this is 262144, the width that
        # used to be hard-coded here.
        self.fc1 = nn.Linear(block_in * pooled_bins * 256, num_classes)

    def forward(self, x):
        """Return class logits for a batch of spectrograms.

        Args:
            x: 4-D tensor; assumed ``(batch, channels_in, n_fft // 2 + 1, frames)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        # Only `convs`, `freq_pool` and `fc1` are used here, so modules that
        # were pickled before other attributes existed still run.
        for conv in self.convs:
            x = conv(x)
        x = self.freq_pool(x)

        x = x.flatten(start_dim=1)
        return self.fc1(x)

# Checkpoint file handed to load_model().
MODEL_PATH = "emotion2_checkpoint.pt"

# Label for each output index of the classifier, in index order.
CLASSES = [
    "angry",
    "disgust",
    "fear",
    "happy",
    "neutral",
    "sad",
]

# Audio front-end settings used when computing the spectrogram.
SAMPLE_RATE = 16_000
N_FFT = 2048
HOP_LENGTH = 512

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

def load_model(path, device=None):
    """Load a fully pickled model from ``path`` and put it in eval mode.

    Args:
        path: Checkpoint file written with ``torch.save(model, ...)``. Because
            ``.eval()`` is called on the loaded object, the file must hold a
            whole module rather than a bare ``state_dict``.
        device: Target passed to ``map_location``. Defaults to the module-level
            ``DEVICE`` when omitted.

    Returns:
        The loaded module, switched to evaluation mode.
    """
    if device is None:
        device = DEVICE
    # SECURITY: weights_only=False runs the full pickle machinery, which can
    # execute arbitrary code embedded in the file. Only load checkpoints from
    # a trusted source. Unpickling also needs the model's class to be
    # importable/defined in the current session.
    model = torch.load(path, map_location=device, weights_only=False)
    model.eval()
    return model

def process_audio(file_path):
    """Turn an audio file into a normalised log-magnitude spectrogram tensor.

    The file is read with ``librosa.load`` at ``SAMPLE_RATE``, converted to an
    STFT magnitude (``N_FFT`` / ``HOP_LENGTH``), expressed in dB relative to
    its own peak, and standardised to zero mean and unit variance over the
    whole clip.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A float32 tensor on ``DEVICE`` with two leading singleton axes added in
        front of the spectrogram (presumably ``(1, 1, freq_bins, frames)`` for
        mono audio).
    """
    waveform, _ = librosa.load(file_path, sr=SAMPLE_RATE)

    magnitude = np.abs(librosa.stft(waveform, n_fft=N_FFT, hop_length=HOP_LENGTH))
    log_spec = librosa.amplitude_to_db(magnitude, ref=np.max)

    # Per-clip standardisation; the epsilon keeps a constant clip from dividing by zero.
    centred = log_spec - log_spec.mean()
    scale = log_spec.std() + 1e-6
    normalised = centred / scale

    batched = torch.tensor(normalised).unsqueeze(0).unsqueeze(0)
    return batched.float().to(DEVICE)

def classify(model, spec_tensor):
    """Run the model on one spectrogram and return the predicted label.

    Args:
        model: Callable module that returns per-class scores for ``spec_tensor``.
        spec_tensor: Input batch; it must contain exactly one example, because
            the arg-max result is converted to a Python scalar with ``.item()``.

    Returns:
        The entry of ``CLASSES`` at the index of the highest score.
    """
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(spec_tensor)
        best_index = logits.argmax(dim=1).item()
    label = CLASSES[best_index]
    return label

if __name__ == "__main__":
    # Raw string: a Windows path is full of backslashes, and sequences such as
    # "\T" or "\D" are invalid escapes (SyntaxWarning on recent Python, planned
    # to become an error). The runtime value is unchanged.
    audio_path = r"C:\Users\The Factory\Documents\GPU-Accelerated-Notebooks\MAIS-hacks\crema-d-mirror\AudioMP3\1001_DFA_SAD_XX.mp3"

    # Load the checkpoint, featurise the clip, and print the predicted label.
    model = load_model(MODEL_PATH)
    spec_tensor = process_audio(audio_path)
    prediction = classify(model, spec_tensor)

    print(f"predicted class: {prediction}")

predicted class: fear
