
# CREMA-D - CNN Baseline

This notebook implements a lightweight, interpretable CNN baseline on **CREMA-D** using 128-band log-mel spectrograms.

**What it does**
- Loads CREMA-D `.wav` files from a path you set (`DATA_ROOT`)
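- Resamples audio to 16 kHz and extracts 128-band log-mel spectrograms from a centered 2.5 s window
- Builds a CNN with 4 convolutional blocks (32->64->128->256 channels), global average pooling, and a dense layer with dropout
- Trains with the Adam optimizer and early stopping on validation loss, using simple augmentations (time shift, noise, SpecAugment)
- Uses a stratified 80/10/10 train/val/test split and prints a final classification report on the test set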



In [None]:

# Optional: install dependencies (run locally if needed)
# %pip install torch torchaudio librosa scikit-learn numpy matplotlib


In [None]:

import os, random
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Config
# Every tunable value of the notebook lives in this block.

SEED = 42
# Seed each RNG the pipeline draws from: `random` and NumPy are used by the
# augmentations, torch by the model and the shuffling DataLoader.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Folder that directly contains the CREMA-D .wav files.
# Set the CREMA_D_ROOT environment variable to point at your own copy; the
# original author's local path is kept only as the fallback default.
DATA_ROOT = os.environ.get(
    "CREMA_D_ROOT",
    "/Users/swathiarasu/Desktop/Courses/Sem 3/CS 7150/Emotion Detection/AudioWAV",
)

# Audio / feature extraction
SAMPLE_RATE = 16_000   # Hz; every clip is resampled to this rate on load
N_MELS = 128           # number of mel bands
N_FFT = 1024           # FFT size in samples
HOP_LENGTH = 256       # hop between frames in samples
WIN_LENGTH = 1024      # analysis window in samples
FMIN = 20              # lowest mel filter edge (Hz)
FMAX = 8000            # highest mel filter edge (Hz); Nyquist at 16 kHz
DURATION = 2.5         # seconds kept per clip (center-cropped or padded)

# Training
BATCH_SIZE = 32
LR = 1e-3
EPOCHS = 50
PATIENCE = 6          # early stopping
VAL_SPLIT = 0.1       # fraction of all files used for validation
TEST_SPLIT = 0.1      # fraction of all files used for testing

# Emotion code (third "_"-separated field of the file name) -> class index.
EMO_MAP = {"ANG":0, "DIS":1, "FEA":2, "HAP":3, "NEU":4, "SAD":5}
# Class index -> emotion code, used to label the final report.
INV_EMO = {v:k for k,v in EMO_MAP.items()}

# Utils

def parse_label_from_filename(fname: str):
    """Map a CREMA-D style file name to its integer class index.

    The emotion code is the third "_"-separated field of the file stem.

    Args:
        fname: File name or path of a clip.

    Returns:
        The class index from ``EMO_MAP``, or ``None`` when the stem has fewer
        than three fields or the code is not a key of ``EMO_MAP``.
    """
    try:
        emotion_code = Path(fname).stem.split("_")[2]
    except IndexError:
        # Not enough fields in the name: treat as "no label".
        emotion_code = None
    return EMO_MAP.get(emotion_code)

def load_wav_centered(path, target_sr=SAMPLE_RATE, duration=DURATION):
    """Load a clip as mono audio and force it to a fixed length.

    Args:
        path: Audio file to read.
        target_sr: Sample rate the audio is resampled to on load.
        duration: Desired clip length in seconds.

    Returns:
        A 1-D array of exactly ``int(duration * target_sr)`` samples. Longer
        clips are cropped around their center; shorter clips are reflect-padded
        on both sides, with the odd extra sample going to the right.
    """
    samples, _ = librosa.load(path, sr=target_sr, mono=True)
    n_target = int(duration * target_sr)
    surplus = len(samples) - n_target

    if surplus >= 0:
        # Enough audio: keep the middle n_target samples.
        offset = surplus // 2
        return samples[offset:offset + n_target]

    # Too short: split the missing samples between both ends.
    deficit = -surplus
    pad_left = deficit // 2
    pad_right = deficit - pad_left
    return np.pad(samples, (pad_left, pad_right), mode="reflect")

def wav_to_logmel(wav, sr=SAMPLE_RATE):
    """Turn a waveform into a standardized log-mel spectrogram.

    Args:
        wav: 1-D audio signal.
        sr: Sample rate of ``wav``.

    Returns:
        A float32 array in dB (relative to the clip's own maximum), shifted and
        scaled to zero mean and unit standard deviation over the whole clip.
    """
    mel_settings = dict(
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        win_length=WIN_LENGTH,
        n_mels=N_MELS,
        fmin=FMIN,
        fmax=FMAX,
        power=2.0,
    )
    power_spec = librosa.feature.melspectrogram(y=wav, sr=sr, **mel_settings)
    db_spec = librosa.power_to_db(power_spec, ref=np.max)

    # Per-utterance standardization; the epsilon avoids dividing by zero
    # when the spectrogram is constant.
    offset = db_spec.mean()
    scale = db_spec.std() + 1e-6
    normalized = (db_spec - offset) / scale
    return normalized.astype(np.float32)

# Augmentations
def augment_waveform(wav, sr=SAMPLE_RATE, p_shift=0.5, p_noise=0.5):
    """Randomly time-shift a waveform and/or add light Gaussian noise.

    Args:
        wav: 1-D NumPy array of audio samples.
        sr: Unused; kept so existing callers that pass it keep working.
        p_shift: Probability of applying the circular time shift.
        p_noise: Probability of adding noise.

    Returns:
        The augmented waveform with the same length as ``wav``. For floating
        point input the dtype is preserved (adding float64 noise would
        otherwise silently promote float32 audio to float64). Empty input is
        returned unchanged.
    """
    if len(wav) == 0:
        # Nothing to shift, and np.amax below would raise on an empty array.
        return wav

    in_dtype = wav.dtype

    # time shift: circular roll by up to +/-10% of the clip length
    if random.random() < p_shift:
        max_shift = int(0.1 * len(wav))
        shift = random.randint(-max_shift, max_shift)
        wav = np.roll(wav, shift)

    # add light noise, scaled to at most 0.5% of the clip's peak amplitude
    if random.random() < p_noise:
        peak = np.amax(np.abs(wav)) + 1e-6
        noise_amp = 0.005 * np.random.uniform() * peak
        noisy = wav + noise_amp * np.random.normal(size=wav.shape[0])
        if np.issubdtype(in_dtype, np.floating):
            # Cast back so augmented and clean clips share one dtype.
            noisy = noisy.astype(in_dtype, copy=False)
        wav = noisy

    return wav

def spec_augment(mel, num_time_masks=1, num_freq_masks=1, max_time_mask=0.1, max_freq_mask=0.1):
    """Zero out random frequency bands and time spans of a spectrogram.

    Args:
        mel: 2-D array laid out as (n_mels, T). It is not modified.
        num_time_masks: Number of time masks to apply.
        num_freq_masks: Number of frequency masks to apply.
        max_time_mask: Upper bound of a time mask's width, as a fraction of T.
        max_freq_mask: Upper bound of a frequency mask's height, as a fraction
            of n_mels.

    Returns:
        A masked copy of ``mel``. Every mask is at least one row/column wide.
    """
    masked = mel.copy()
    n_bands, n_frames = masked.shape

    def _draw_span(axis_len, max_fraction):
        # Width first (NumPy RNG), then start position (`random` RNG).
        width = int(max(1, max_fraction * axis_len * np.random.rand()))
        start = random.randint(0, max(0, axis_len - width))
        return start, start + width

    # Frequency masks blank whole rows.
    for _ in range(num_freq_masks):
        lo, hi = _draw_span(n_bands, max_freq_mask)
        masked[lo:hi, :] = 0.0

    # Time masks blank whole columns.
    for _ in range(num_time_masks):
        lo, hi = _draw_span(n_frames, max_time_mask)
        masked[:, lo:hi] = 0.0

    return masked

# Dataset

class CremaMelDataset(Dataset):
    """Dataset that turns CREMA-D file paths into (log-mel tensor, label) pairs.

    Features are computed on the fly for each item. When ``train`` is true,
    waveform augmentation and SpecAugment masking are applied.
    """

    def __init__(self, files, train=True):
        """Store the list of audio paths and whether to augment."""
        self.files, self.train = files, train

    def __len__(self):
        """Return the number of clips."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(mel, label)``.

        ``mel`` is a tensor shaped (1, n_mels, T); ``label`` is whatever
        ``parse_label_from_filename`` returns for the path.
        """
        path = self.files[idx]
        label = parse_label_from_filename(path)

        audio = load_wav_centered(path, SAMPLE_RATE, DURATION)
        if self.train:
            audio = augment_waveform(audio, SAMPLE_RATE, p_shift=0.7, p_noise=0.7)

        spec = wav_to_logmel(audio, SAMPLE_RATE)
        if self.train:
            spec = spec_augment(spec, num_time_masks=1, num_freq_masks=1)

        # Add the leading channel axis expected by Conv2d.
        features = torch.from_numpy(spec).unsqueeze(0)
        return features, label

# Model: 4 conv blocks 

class CNNBaseline(nn.Module):
    """Four-block CNN classifier over single-channel spectrogram inputs.

    Each block is Conv(3x3) -> BatchNorm -> ReLU -> MaxPool(2x2) -> Dropout,
    with channel widths 32, 64, 128 and 256. Global average pooling collapses
    the spatial axes, followed by a 256-unit dense layer and the output layer.

    Args:
        n_classes: Number of output logits.
        in_ch: Number of input channels.
        dropout: Dropout probability used after every conv block.
    """

    def __init__(self, n_classes=6, in_ch=1, dropout=0.3):
        super().__init__()
        self.block1 = self._conv_block(in_ch, 32, dropout)
        self.block2 = self._conv_block(32, 64, dropout)
        self.block3 = self._conv_block(64, 128, dropout)
        self.block4 = self._conv_block(128, 256, dropout)
        self.gap = nn.AdaptiveAvgPool2d((1,1))
        self.fc1 = nn.Linear(256, 256)
        self.drop_fc = nn.Dropout(0.5)
        # Size the head from n_classes; it was previously hard-coded to 6,
        # so the constructor argument had no effect.
        self.out = nn.Linear(256, n_classes)

    @staticmethod
    def _conv_block(in_channels, out_channels, dropout):
        """Build one Conv -> BN -> ReLU -> MaxPool -> Dropout stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((2,2)),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Return class logits of shape (batch, n_classes).

        ``x`` is a 4-D batch (batch, in_ch, height, width). The four 2x2 max
        pools halve each spatial axis four times, so both must be at least 16.
        """
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        # (batch, 256, 1, 1) -> (batch, 256)
        x = self.gap(x).squeeze(-1).squeeze(-1)
        x = F.relu(self.fc1(x))
        x = self.drop_fc(x)
        return self.out(x)

# Early Stopping

class EarlyStopper:
    """Track a metric that should decrease and signal when to stop training.

    Attributes are ``best`` (lowest metric seen), ``count`` (consecutive
    non-improving steps) and ``stopped`` (latched once patience runs out).
    """

    def __init__(self, patience=PATIENCE, min_delta=0.0):
        """Configure the stopper.

        Args:
            patience: Number of consecutive non-improving steps tolerated.
            min_delta: Amount by which the metric must drop below the best
                value to count as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best = None
        self.count = 0
        self.stopped = False

    def step(self, metric):
        """Record one new metric value and return whether training should stop."""
        is_first = self.best is None
        if is_first or metric < self.best - self.min_delta:
            # New best: remember it and restart the patience counter.
            self.best, self.count = metric, 0
            return self.stopped

        self.count += 1
        if self.count >= self.patience:
            self.stopped = True
        return self.stopped

# Train / Eval loops

def train_one_epoch(model, loader, opt, criterion):
    """Run one optimization pass over ``loader``.

    Args:
        model: Network to train; put into training mode here.
        loader: Iterable of ``(inputs, labels)`` batches.
        opt: Optimizer that updates ``model``'s parameters.
        criterion: Loss taking ``(logits, labels)``.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all samples seen.
    """
    model.train()
    total, correct, loss_sum = 0, 0, 0.0
    for xb, yb in loader:
        xb = xb.to(DEVICE)
        # Labels usually arrive already collated into a tensor; as_tensor reuses
        # it, whereas torch.tensor(yb) copies and emits a UserWarning.
        yb = torch.as_tensor(yb).to(DEVICE)
        opt.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        opt.step()
        # Weight by batch size so the final mean is per-sample.
        loss_sum += loss.item() * xb.size(0)
        preds = logits.argmax(1)
        total += yb.size(0)
        correct += (preds == yb).sum().item()
    return loss_sum/total, correct/total

@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0
    all_preds, all_labels = [], []
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), torch.tensor(yb).to(DEVICE)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss_sum += loss.item() * xb.size(0)
        preds = logits.argmax(1)
        total += yb.size(0)
        correct += (preds == yb).sum().item()
        all_preds.append(preds.cpu().numpy())
        all_labels.append(yb.cpu().numpy())
    import numpy as np
    avg_loss = loss_sum/total
    acc = correct/total
    return avg_loss, acc, np.concatenate(all_preds), np.concatenate(all_labels)

def collect_files(root):
    """List the labelled ``.wav`` files directly inside ``root``.

    Args:
        root: Directory to scan (not recursive).

    Returns:
        Sorted list of paths for which ``parse_label_from_filename`` yields a
        label; files with an unrecognized name are dropped.
    """
    pattern = os.path.join(root, "*.wav")
    return [
        path
        for path in sorted(glob(pattern))
        if parse_label_from_filename(path) is not None
    ]

def stratified_split(files):
    """Split file paths into train / validation / test lists, stratified by label.

    The hold-out share (``VAL_SPLIT + TEST_SPLIT``) is carved off first and
    then divided between validation and test; both steps use ``SEED``.

    Args:
        files: Paths whose labels can be parsed from their names.

    Returns:
        ``(train_files, val_files, test_files)``.
    """
    labels = [parse_label_from_filename(path) for path in files]

    # Step 1: train vs. everything held out.
    holdout_fraction = VAL_SPLIT + TEST_SPLIT
    train_files, holdout_files, _, holdout_labels = train_test_split(
        files,
        labels,
        test_size=holdout_fraction,
        random_state=SEED,
        stratify=labels,
    )

    # Step 2: divide the hold-out set; the test share is whatever validation
    # does not take.
    val_share = VAL_SPLIT / holdout_fraction
    val_files, test_files, _, _ = train_test_split(
        holdout_files,
        holdout_labels,
        test_size=1 - val_share,
        random_state=SEED,
        stratify=holdout_labels,
    )
    return train_files, val_files, test_files

def main():
    """Train the CNN baseline end to end and write its artifacts.

    Steps: collect and split the files, train with early stopping on
    validation loss, restore the best weights, then evaluate on the test split.

    Writes ``cnn_history.csv`` (per-epoch metrics),
    ``classification_report_cnn.txt`` and ``cnn_preds.csv`` to the current
    working directory.

    Raises:
        FileNotFoundError: If no usable ``.wav`` file is found in ``DATA_ROOT``.
    """
    files = collect_files(DATA_ROOT)
    print(f"Found {len(files)} usable wav files.")
    if len(files) == 0:
        raise FileNotFoundError("No usable .wav files found. Check DATA_ROOT and filename format.")
    X_train, X_val, X_test = stratified_split(files)
    print(f"Split sizes -> train: {len(X_train)} | val: {len(X_val)} | test: {len(X_test)}")

    # Only the training set is augmented and shuffled.
    train_ds = CremaMelDataset(X_train, train=True)
    val_ds   = CremaMelDataset(X_val,   train=False)
    test_ds  = CremaMelDataset(X_test,  train=False)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0)
    val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    model = CNNBaseline(n_classes=len(EMO_MAP)).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    history = {
        "epoch":      [],
        "train_loss": [],
        "val_loss":   [],
        "train_acc":  [],
        "val_acc":    [],
    }

    stopper = EarlyStopper(patience=PATIENCE, min_delta=0.0)
    best_state = None
    best_val = float("inf")

    for epoch in range(1, EPOCHS+1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, optimizer, criterion)
        va_loss, va_acc, _, _ = evaluate(model, val_loader, criterion)
        print(f"Epoch {epoch:02d}: train_loss={tr_loss:.4f} acc={tr_acc:.3f} | val_loss={va_loss:.4f} acc={va_acc:.3f}")

        history["epoch"].append(epoch)
        history["train_loss"].append(tr_loss)
        history["val_loss"].append(va_loss)
        history["train_acc"].append(tr_acc)
        history["val_acc"].append(va_acc)

        # Keep a CPU copy of the weights with the lowest validation loss so far.
        if va_loss < best_val:
            best_val = va_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

        if stopper.step(va_loss):
            print(f"Early stopping at epoch {epoch}.")
            break

    # Report test metrics for the best checkpoint, not the last epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    pd.DataFrame(history).to_csv("cnn_history.csv", index=False)
    print("Saved cnn_history.csv")

    # final test evaluation
    te_loss, te_acc, preds, labels = evaluate(model, test_loader, criterion)
    print(f"\nTest: loss={te_loss:.4f} acc={te_acc:.3f}")

    # Build the report once and reuse it for both the console and the file.
    # Passing `labels` explicitly keeps target_names aligned even if a class
    # is missing from the test labels and predictions.
    class_ids = list(range(len(EMO_MAP)))
    report = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=[INV_EMO[i] for i in class_ids],
    )
    print(report)
    with open("classification_report_cnn.txt", "w") as f:
        f.write(report)
    print("Saved CNN classification report to classification_report_cnn.txt")

    # export test predictions
    pd.DataFrame({"y_true": labels, "y_pred": preds}).to_csv("cnn_preds.csv", index=False)
    print("Saved cnn_preds.csv")


In [None]:

# Run the full pipeline: train with early stopping, then evaluate on the test split.
main()


Found 7442 usable wav files.
Split sizes -> train: 5953 | val: 744 | test: 745


  xb, yb = xb.to(DEVICE), torch.tensor(yb).to(DEVICE)
  xb, yb = xb.to(DEVICE), torch.tensor(yb).to(DEVICE)


Epoch 01: train_loss=1.6069 acc=0.311 | val_loss=1.5597 acc=0.360
Epoch 02: train_loss=1.5212 acc=0.363 | val_loss=1.7022 acc=0.293
Epoch 03: train_loss=1.4967 acc=0.371 | val_loss=1.4973 acc=0.368
Epoch 04: train_loss=1.4908 acc=0.378 | val_loss=1.4192 acc=0.423
Epoch 05: train_loss=1.4619 acc=0.380 | val_loss=1.4355 acc=0.456
Epoch 06: train_loss=1.4405 acc=0.401 | val_loss=1.3955 acc=0.409
Epoch 07: train_loss=1.4167 acc=0.414 | val_loss=1.4663 acc=0.398
Epoch 08: train_loss=1.4022 acc=0.421 | val_loss=1.3734 acc=0.442
Epoch 09: train_loss=1.3825 acc=0.424 | val_loss=1.5394 acc=0.395
Epoch 10: train_loss=1.3848 acc=0.431 | val_loss=1.3919 acc=0.383
Epoch 11: train_loss=1.3758 acc=0.440 | val_loss=1.4428 acc=0.402
Epoch 12: train_loss=1.3700 acc=0.437 | val_loss=1.5605 acc=0.379
Epoch 13: train_loss=1.3552 acc=0.447 | val_loss=1.2635 acc=0.491
Epoch 14: train_loss=1.3465 acc=0.454 | val_loss=1.3451 acc=0.438
Epoch 15: train_loss=1.3415 acc=0.462 | val_loss=1.2612 acc=0.515
Epoch 16: 