In [1]:
import os, glob, math, json, random, warnings, numpy as np
import librosa, soundfile as sf
from dataclasses import dataclass
warnings.filterwarnings("ignore")
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import MulticlassF1Score, MulticlassConfusionMatrix

In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED=42; random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

<torch._C.Generator at 0x2357fc984d0>

In [3]:
CLASSES = ["happy","sad","neutral","angry","fear","surprise","disgust"]
IDX = {c:i for i,c in enumerate(CLASSES)}

RAVDESS_EMO_MAP = {
    "01":"neutral","02":"neutral","03":"happy","04":"sad",
    "05":"angry","06":"fear","07":"disgust","08":"surprise"
}

In [4]:
SR=16000
DUR=2.0
SAMPLES=int(SR*DUR)
N_MELS=64

In [5]:
import shutil, re

def ensure_dir(p): os.makedirs(p, exist_ok=True)

PAT = re.compile(r"^(?P<MM>\d{2})-(?P<VC>\d{2})-(?P<EM>\d{2})-(?P<IN>\d{2})-(?P<ST>\d{2})-(?P<RE>\d{2})-(?P<AC>\d{2})\.wav$", re.I)


In [6]:
class SpecCfg:
    sr: int = SR
    dur: float = DUR
    n_mels: int = N_MELS
    hop_length: int = 256
    n_fft: int = 1024
    fmin: int = 50
    fmax: int = 8000
    time_mask: int = 16
    freq_mask: int = 8

CFG = SpecCfg()

In [7]:
def _list_ravdess(root, actors=None):
    """Return list of (path, label) filtered to speech audio, desired actors."""
    items = []
    for actor_dir in sorted(glob.glob(os.path.join(root, "Actor_*"))):
        ac = actor_dir.split("_")[-1]  
        if actors and ac not in actors:  
            continue
        for wav in sorted(glob.glob(os.path.join(actor_dir, "*.wav"))):
            m = PAT.match(os.path.basename(wav))
            if not m: 
                continue
            MM, VC, EM = m["MM"], m["VC"], m["EM"]
            if MM != "03":        
                continue
            if VC != "01":        
                continue
            lab = RAVDESS_EMO_MAP.get(EM)
            if lab not in IDX:
                continue
            items.append((wav, IDX[lab]))
    return items

In [8]:
class RAVDESSMelDataset(Dataset):
    def __init__(self, root, actors_keep=None, augment=False):
        """
        root: path that contains Actor_01 ... Actor_24 folders
        actors_keep: iterable of actor codes like {'01','02',...} or None for all
        augment: use SpecAugment (train only)
        """
        self.items = _list_ravdess(root, actors=set(actors_keep) if actors_keep else None)
        if not self.items:
            raise RuntimeError(f"No RAVDESS wavs found under {root}")
        self.augment = augment

    def __len__(self): return len(self.items)

    def _load_wave(self, path):
        y, sr = sf.read(path, dtype="float32", always_2d=False)
        if y.ndim > 1: y = y.mean(axis=1)           # mono
        if sr != CFG.sr: y = librosa.resample(y, orig_sr=sr, target_sr=CFG.sr)
        if len(y) < SAMPLES: y = np.pad(y, (0, SAMPLES-len(y)))
        else: y = y[:SAMPLES]
        return y

    def _logmel(self, y):
        M = librosa.feature.melspectrogram(
            y=y, sr=CFG.sr, n_mels=CFG.n_mels,
            n_fft=CFG.n_fft, hop_length=CFG.hop_length,
            fmin=CFG.fmin, fmax=CFG.fmax
        )
        L = librosa.power_to_db(M).astype(np.float32)
        # per-sample standardization
        L = (L - L.mean()) / (L.std() + 1e-6)
        if self.augment:
            t = L.shape[1]
            # time mask
            t0 = np.random.randint(0, max(1, t-CFG.time_mask))
            L[:, t0:t0+CFG.time_mask] = 0
            # freq mask
            f0 = np.random.randint(0, max(1, CFG.n_mels-CFG.freq_mask))
            L[f0:f0+CFG.freq_mask, :] = 0
            # small random dB gain
            L = L + np.random.uniform(-2, 2)
        return L

    def __getitem__(self, i):
        path, y_idx = self.items[i]
        y = self._load_wave(path)
        mel = self._logmel(y)
        x = torch.from_numpy(mel).unsqueeze(0)  # (1, n_mels, T)
        return x, torch.tensor(y_idx, dtype=torch.long)

In [19]:
TRAIN_ACTORS = {f"{i:02d}" for i in range(1,21)}
VAL_ACTORS   = {f"{i:02d}" for i in range(21,25)}

root = r"data\processed"

train_ds = RAVDESSMelDataset(root, actors_keep=TRAIN_ACTORS, augment=True)
val_ds   = RAVDESSMelDataset(root, actors_keep=VAL_ACTORS,   augment=False)

from torch.utils.data import DataLoader
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True,   pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False,  pin_memory=True)

print(len(train_ds), len(val_ds))


1200 240


In [12]:
class TinyCNN(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(), nn.Linear(64, n_classes)
        )
    def forward(self, x): return self.net(x)

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = TinyCNN(len(CLASSES)).to(device)

In [14]:
counts = np.zeros(len(CLASSES), int)
for _, y in train_ds:
    counts[y.item()] += 1
w = (counts.sum() / np.maximum(1, counts)).astype(np.float32)
w = w / w.mean()
W = torch.tensor(w, device=device)

In [15]:
crit = nn.CrossEntropyLoss(weight=W, label_smoothing=0.05)
opt  = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))

In [16]:
def eval_loop():
    model.eval(); tot=0; right=0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            with torch.cuda.amp.autocast(enabled=(device=="cuda")):
                logits = model(x)
            pred = logits.argmax(1)
            right += (pred==y).sum().item(); tot += y.numel()
    return right/tot

In [None]:
best=0.0; patience=0
for ep in range(1, 100):
    model.train()
    running=0.0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(device=="cuda")):
            logits = model(x); loss = crit(logits, y)
        scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
        running += loss.item()

    acc = eval_loop()
    print(f"Epoch {ep:02d} | loss {running/len(train_loader):.4f} | val_acc {acc:.3f}")
    if acc>best:
        best=acc; patience=0
        os.makedirs("models/voice", exist_ok=True)
        torch.save({"model":model.state_dict(),"classes":CLASSES,"sr":SR,"n_mels":N_MELS}, "models/voice/tinycnn.pt")
        print("✅ saved models/voice/tinycnn.pt")
    else:
        patience += 1
        if patience >= 6:
            print("Early stopping.")
            break

Epoch 01 | loss 1.9514 | val_acc 0.133
✅ saved models/voice/tinycnn.pt
Epoch 02 | loss 1.9516 | val_acc 0.133
Epoch 03 | loss 1.9484 | val_acc 0.133
Epoch 04 | loss 1.9492 | val_acc 0.133
Epoch 05 | loss 1.9486 | val_acc 0.133
Epoch 06 | loss 1.9478 | val_acc 0.163
✅ saved models/voice/tinycnn.pt
Epoch 07 | loss 1.9482 | val_acc 0.154
Epoch 08 | loss 1.9485 | val_acc 0.133
Epoch 09 | loss 1.9472 | val_acc 0.133
Epoch 10 | loss 1.9482 | val_acc 0.175
✅ saved models/voice/tinycnn.pt
Epoch 11 | loss 1.9468 | val_acc 0.158
Epoch 12 | loss 1.9463 | val_acc 0.142
Epoch 13 | loss 1.9451 | val_acc 0.192
✅ saved models/voice/tinycnn.pt
Epoch 14 | loss 1.9427 | val_acc 0.142
Epoch 15 | loss 1.9397 | val_acc 0.242
✅ saved models/voice/tinycnn.pt
Epoch 16 | loss 1.9415 | val_acc 0.150
Epoch 17 | loss 1.9404 | val_acc 0.208
Epoch 18 | loss 1.9380 | val_acc 0.133
Epoch 19 | loss 1.9326 | val_acc 0.254
✅ saved models/voice/tinycnn.pt
Epoch 20 | loss 1.9335 | val_acc 0.125
Epoch 21 | loss 1.9277 | val

In [23]:
for ep in range(1, 150):
    model.train()
    running=0.0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(device=="cuda")):
            logits = model(x); loss = crit(logits, y)
        scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
        running += loss.item()

    acc = eval_loop()
    print(f"Epoch {ep:02d} | loss {running/len(train_loader):.4f} | val_acc {acc:.3f}")
    if acc>best:
        best=acc; patience=0
        os.makedirs("models/voice", exist_ok=True)
        torch.save({"model":model.state_dict(),"classes":CLASSES,"sr":SR,"n_mels":N_MELS}, "models/voice/tinycnn.pt")
        print("✅ saved models/voice/tinycnn.pt")
    else:
        patience += 1
        if patience >= 50:
            print("Early stopping.")
            break

Epoch 01 | loss 1.8363 | val_acc 0.279
Epoch 02 | loss 1.8213 | val_acc 0.300
Epoch 03 | loss 1.8136 | val_acc 0.254
Epoch 04 | loss 1.7962 | val_acc 0.279
Epoch 05 | loss 1.8181 | val_acc 0.321
Epoch 06 | loss 1.8089 | val_acc 0.275
Epoch 07 | loss 1.8078 | val_acc 0.283
Epoch 08 | loss 1.8157 | val_acc 0.346
Epoch 09 | loss 1.8119 | val_acc 0.258
Epoch 10 | loss 1.8000 | val_acc 0.283
Epoch 11 | loss 1.8143 | val_acc 0.188
Epoch 12 | loss 1.8267 | val_acc 0.212
Epoch 13 | loss 1.7957 | val_acc 0.233
Epoch 14 | loss 1.7888 | val_acc 0.263
Epoch 15 | loss 1.7874 | val_acc 0.338
Epoch 16 | loss 1.7863 | val_acc 0.237
Epoch 17 | loss 1.7960 | val_acc 0.312
Epoch 18 | loss 1.7977 | val_acc 0.292
Epoch 19 | loss 1.7803 | val_acc 0.308
Epoch 20 | loss 1.7824 | val_acc 0.342
Epoch 21 | loss 1.7689 | val_acc 0.300
Epoch 22 | loss 1.7902 | val_acc 0.287
Epoch 23 | loss 1.7961 | val_acc 0.271
Epoch 24 | loss 1.7684 | val_acc 0.296
Epoch 25 | loss 1.7864 | val_acc 0.287
Epoch 26 | loss 1.7862 | 

In [27]:
for ep in range(1, 2500):
    model.train()
    running=0.0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(device=="cuda")):
            logits = model(x); loss = crit(logits, y)
        scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
        running += loss.item()

    acc = eval_loop()
    print(f"Epoch {ep:02d} | loss {running/len(train_loader):.4f} | val_acc {acc:.3f}")
    if acc>best:
        best=acc; patience=0
        os.makedirs("models/voice", exist_ok=True)
        torch.save({"model":model.state_dict(),"classes":CLASSES,"sr":SR,"n_mels":N_MELS}, "models/voice/tinycnn.pt")
        print("✅ saved models/voice/tinycnn.pt")
    else:
        patience += 1

Epoch 01 | loss 1.4123 | val_acc 0.412
Epoch 02 | loss 1.3927 | val_acc 0.442
Epoch 03 | loss 1.3796 | val_acc 0.429
Epoch 04 | loss 1.4002 | val_acc 0.412
Epoch 05 | loss 1.3754 | val_acc 0.454
Epoch 06 | loss 1.3995 | val_acc 0.446
Epoch 07 | loss 1.3986 | val_acc 0.483
✅ saved models/voice/tinycnn.pt
Epoch 08 | loss 1.3977 | val_acc 0.400
Epoch 09 | loss 1.3868 | val_acc 0.412
Epoch 10 | loss 1.3727 | val_acc 0.438
Epoch 11 | loss 1.3810 | val_acc 0.425
Epoch 12 | loss 1.3656 | val_acc 0.454
Epoch 13 | loss 1.3825 | val_acc 0.467
Epoch 14 | loss 1.3928 | val_acc 0.450
Epoch 15 | loss 1.3929 | val_acc 0.433
Epoch 16 | loss 1.4186 | val_acc 0.412
Epoch 17 | loss 1.3945 | val_acc 0.446
Epoch 18 | loss 1.3799 | val_acc 0.429
Epoch 19 | loss 1.3927 | val_acc 0.417
Epoch 20 | loss 1.4028 | val_acc 0.475
Epoch 21 | loss 1.3880 | val_acc 0.450
Epoch 22 | loss 1.3814 | val_acc 0.433
Epoch 23 | loss 1.3894 | val_acc 0.438
Epoch 24 | loss 1.3991 | val_acc 0.421
Epoch 25 | loss 1.3828 | val_acc

In [28]:
dummy = torch.randn(1,1,N_MELS, 128).to(device)
onnx_path = "models/voice/tinycnn.onnx"
torch.onnx.export(model.eval(), dummy, onnx_path,
                  input_names=["input"], output_names=["logits"],
                  opset_version=17,
                  dynamic_axes={"input":{0:"batch", 3:"time"}, "logits":{0:"batch"}})
print("Exported:", onnx_path)


W1023 10:48:04.553000 31080 Lib\site-packages\torch\onnx\_internal\exporter\_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 17 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features
W1023 10:48:08.828000 31080 Lib\site-packages\torch\onnx\_internal\exporter\_registration.py:107] torchvision is not installed. Skipping torchvision::nms


[torch.onnx] Obtain model graph for `TinyCNN([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `TinyCNN([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...


The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 17).


[torch.onnx] Translate the graph into ONNX... ✅


Failed to convert the model to the target version 17 using the ONNX C API. The model was not modified
Traceback (most recent call last):
  File "g:\Work\Github Projects\Emotion-Driven Music Recommendation System\emotion-music-reco\models\voice\.venv\Lib\site-packages\onnxscript\version_converter\__init__.py", line 127, in call
    converted_proto = _c_api_utils.call_onnx_api(
        func=_partial_convert_version, model=model
    )
  File "g:\Work\Github Projects\Emotion-Driven Music Recommendation System\emotion-music-reco\models\voice\.venv\Lib\site-packages\onnxscript\version_converter\_c_api_utils.py", line 65, in call_onnx_api
    result = func(proto)
  File "g:\Work\Github Projects\Emotion-Driven Music Recommendation System\emotion-music-reco\models\voice\.venv\Lib\site-packages\onnxscript\version_converter\__init__.py", line 122, in _partial_convert_version
    return onnx.version_converter.convert_version(
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        proto, target

Applied 1 of general pattern rewrite rules.
Exported: models/voice/tinycnn.onnx
