# Speech Emotion Recognition (SER) 


## Dataset downloading from kagglehub

In [1]:
import kagglehub

# Download (cached automatically by kagglehub)
ravdess_path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")
crema_path   = kagglehub.dataset_download("ejlok1/cremad")
meld_path    = kagglehub.dataset_download("brij041/meld-with-audio-files")

print("RAVDESS:", ravdess_path)
print("CREMA-D:", crema_path)
print("MELD:", meld_path)


  from .autonotebook import tqdm as notebook_tqdm


RAVDESS: /home/d8a0/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1
CREMA-D: /home/d8a0/.cache/kagglehub/datasets/ejlok1/cremad/versions/1
MELD: /home/d8a0/.cache/kagglehub/datasets/brij041/meld-with-audio-files/versions/1


In [2]:
from pathlib import Path
import random
import pandas as pd

AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")

def inspect_dataset(root):
    root = Path(root)
    audio_files = [p for p in root.rglob("*") if p.suffix.lower() in AUDIO_EXTS]
    csv_files = list(root.rglob("*.csv"))

    print("\n" + "="*90)
    print("ROOT:", root)
    print("Audio files:", len(audio_files))
    print("CSV files:", len(csv_files))

    if audio_files:
        sample = random.sample(audio_files, k=min(8, len(audio_files)))
        print("\nSample audio paths:")
        for p in sample:
            print(" -", p.relative_to(root))

    if csv_files:
        print("\nSample CSV paths:")
        for p in csv_files[:8]:
            print(" -", p.relative_to(root))

    return audio_files, csv_files

rav_audio, _ = inspect_dataset(ravdess_path)
cre_audio, _ = inspect_dataset(crema_path)
mel_audio, mel_csv = inspect_dataset(meld_path)



ROOT: /home/d8a0/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1
Audio files: 2880
CSV files: 0

Sample audio paths:
 - Actor_23/03-01-03-01-02-01-23.wav
 - audio_speech_actors_01-24/Actor_22/03-01-07-01-01-02-22.wav
 - Actor_22/03-01-03-01-02-01-22.wav
 - Actor_08/03-01-03-02-02-02-08.wav
 - Actor_20/03-01-04-02-02-01-20.wav
 - audio_speech_actors_01-24/Actor_15/03-01-02-02-01-02-15.wav
 - Actor_19/03-01-07-02-01-02-19.wav
 - audio_speech_actors_01-24/Actor_02/03-01-02-01-01-01-02.wav

ROOT: /home/d8a0/.cache/kagglehub/datasets/ejlok1/cremad/versions/1
Audio files: 7442
CSV files: 0

Sample audio paths:
 - AudioWAV/1024_MTI_NEU_XX.wav
 - AudioWAV/1036_IWL_DIS_XX.wav
 - AudioWAV/1017_TIE_DIS_XX.wav
 - AudioWAV/1078_MTI_FEA_XX.wav
 - AudioWAV/1081_ITH_FEA_XX.wav
 - AudioWAV/1060_IWL_ANG_XX.wav
 - AudioWAV/1054_ITH_HAP_XX.wav
 - AudioWAV/1075_IEO_FEA_MD.wav

ROOT: /home/d8a0/.cache/kagglehub/datasets/brij041/meld-with-audio-files/versions/1
Audio files: 1

## Define labels


In [3]:
# Global label set used everywhere (training + mic adaptation + inference)
LABELS = ["angry", "fear", "happy", "neutral", "sad", "surprise"]
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}


## Deduplicate by filename

In [4]:
from collections import defaultdict

def duplicate_report_by_filename(files):
    fn2paths = defaultdict(list)
    for p in files:
        fn2paths[p.name].append(p)

    total = len(files)
    unique = len(fn2paths)
    dup_rows = sum(len(v) for v in fn2paths.values() if len(v) > 1)

    example = None
    for fn, paths in fn2paths.items():
        if len(paths) > 1:
            example = (fn, [str(p) for p in paths[:5]])
            break

    return total, unique, dup_rows, example

def dedup_by_filename(files, prefer_substrings=None):
    prefer_substrings = prefer_substrings or []
    # choose preferred paths first, then keep first occurrence per filename
    def pref_score(p):
        s = str(p)
        for i, sub in enumerate(prefer_substrings):
            if sub in s:
                return i
        return len(prefer_substrings) + 1

    files_sorted = sorted(files, key=pref_score)
    seen = set()
    kept = []
    for p in files_sorted:
        if p.name in seen:
            continue
        seen.add(p.name)
        kept.append(p)
    return kept

rav_total, rav_unique, rav_dup, rav_ex = duplicate_report_by_filename(rav_audio)
cre_total, cre_unique, cre_dup, cre_ex = duplicate_report_by_filename(cre_audio)

print("RAVDESS: total", rav_total, "| unique filenames", rav_unique, "| duplicate rows", rav_dup)
if rav_ex:
    print("Example duplicate filename:", rav_ex[0])
    for s in rav_ex[1]:
        print("  -", s)

print("\nCREMA-D: total", cre_total, "| unique filenames", cre_unique, "| duplicate rows", cre_dup)
if cre_ex:
    print("Example duplicate filename:", cre_ex[0])
    for s in cre_ex[1]:
        print("  -", s)

# Deduplicate (keep one copy per filename)
rav_audio_unique = dedup_by_filename(rav_audio, prefer_substrings=["audio_speech_actors_01-24"])
cre_audio_unique = dedup_by_filename(cre_audio, prefer_substrings=["AudioWAV"])

print("\nAfter dedup -> RAVDESS:", len(rav_audio_unique), "files")
print("After dedup -> CREMA-D:", len(cre_audio_unique), "files")


RAVDESS: total 2880 | unique filenames 1440 | duplicate rows 2880
Example duplicate filename: 03-01-07-02-02-02-03.wav
  - /home/d8a0/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/Actor_03/03-01-07-02-02-02-03.wav
  - /home/d8a0/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/audio_speech_actors_01-24/Actor_03/03-01-07-02-02-02-03.wav

CREMA-D: total 7442 | unique filenames 7442 | duplicate rows 0

After dedup -> RAVDESS: 1440 files
After dedup -> CREMA-D: 7442 files


## Build manifests (RAVDESS + CREMA-D) from filenames

In [5]:
import pandas as pd

TARGET = {"happy", "sad", "angry", "neutral", "surprise", "fear"}

# RAVDESS filename: MM-VC-EM-INT-STAT-REP-ACTOR.wav
# EM: 01 neutral, 02 calm, 03 happy, 04 sad, 05 angry, 06 fearful, 07 disgust, 08 surprised
RAV_EMO = {
    "01": "neutral",
    "02": "neutral",   # calm -> neutral
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fear",
    "07": None,        # disgust -> drop
    "08": "surprise",
}

# CREMA-D filename: ActorID_SentenceID_Emotion_Level.wav (e.g., 1053_IEO_ANG_HI.wav)
CREMA_EMO = {
    "ANG": "angry",
    "HAP": "happy",
    "SAD": "sad",
    "FEA": "fear",
    "NEU": "neutral",
    "DIS": None,       # drop
}

def build_ravdess_manifest_from_files(files):
    rows = []
    for wav in files:
        parts = wav.stem.split("-")
        if len(parts) != 7:
            continue
        emo_code = parts[2]
        actor_id = parts[6]
        label = RAV_EMO.get(emo_code, None)
        if label is None or label not in TARGET:
            continue
        rows.append({
            "path": str(wav),
            "dataset": "ravdess",
            "speaker_id": f"rav_actor_{actor_id}",
            "raw_label": emo_code,
            "label": label
        })
    return pd.DataFrame(rows)

def build_crema_manifest_from_files(files):
    rows = []
    for wav in files:
        parts = wav.stem.split("_")
        if len(parts) < 3:
            continue
        actor_id = parts[0]
        emo_code = parts[2]
        label = CREMA_EMO.get(emo_code, None)
        if label is None or label not in TARGET:
            continue
        rows.append({
            "path": str(wav),
            "dataset": "crema_d",
            "speaker_id": f"cre_actor_{actor_id}",
            "raw_label": emo_code,
            "label": label
        })
    return pd.DataFrame(rows)

rav_df = build_ravdess_manifest_from_files(rav_audio_unique)
cre_df = build_crema_manifest_from_files(cre_audio_unique)

print("RAVDESS kept:", len(rav_df))
print(rav_df["label"].value_counts())

print("\nCREMA-D kept:", len(cre_df))
print(cre_df["label"].value_counts())


RAVDESS kept: 1248
label
neutral     288
fear        192
surprise    192
angry       192
sad         192
happy       192
Name: count, dtype: int64

CREMA-D kept: 6171
label
happy      1271
fear       1271
angry      1271
sad        1271
neutral    1087
Name: count, dtype: int64


## Build MELD manifest (labels from CSV + audio from `audio/{train,dev,test}`)

### MELD audio files are named like: `dia{Dialogue_ID}_utt{Utterance_ID}.wav`.

In [6]:
from pathlib import Path
import pandas as pd

meld_root = Path(meld_path) / "meld-dataset" / "MELD-RAW" / "MELD.Raw"

train_csv = meld_root / "train" / "train_sent_emo.csv"
dev_csv   = meld_root / "dev_sent_emo.csv"
test_csv  = meld_root / "test_sent_emo.csv"

audio_train = meld_root / "audio" / "train"
audio_dev   = meld_root / "audio" / "dev"
audio_test  = meld_root / "audio" / "test"

MELD_MAP = {
    "anger": "angry",
    "sadness": "sad",
    "neutral": "neutral",
    "surprise": "surprise",
    "fear": "fear",
    "joy": "happy",
    "disgust": None,   # drop
}

def build_meld_manifest(csv_path: Path, audio_dir: Path, split_name: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path)

    rows = []
    missing_audio = 0
    dropped_label = 0

    for _, r in df.iterrows():
        raw = str(r["Emotion"]).strip().lower()
        mapped = MELD_MAP.get(raw, None)
        if mapped is None:
            dropped_label += 1
            continue

        did = int(r["Dialogue_ID"])
        uid = int(r["Utterance_ID"])
        fname = f"dia{did}_utt{uid}.wav"
        fpath = audio_dir / fname

        if not fpath.exists():
            missing_audio += 1
            continue

        speaker = str(r["Speaker"]).strip()

        rows.append({
            "path": str(fpath),
            "dataset": "meld",
            "speaker_id": f"meld_{speaker}",
            "raw_label": raw,
            "label": mapped,
            "split_original": split_name,
            "dialogue_id": did,
            "utterance_id": uid
        })

    out = pd.DataFrame(rows)

    print(f"\n[{split_name}] total CSV rows: {len(df)}")
    print(f"[{split_name}] kept: {len(out)} | missing_audio: {missing_audio} | dropped(not in 6): {dropped_label}")
    if len(out) > 0:
        print(f"[{split_name}] label counts:\n", out["label"].value_counts())
    else:
        print(f"[{split_name}] WARNING: kept 0 rows. Check paths.")

    return out

meld_train_df = build_meld_manifest(train_csv, audio_train, "train")
meld_dev_df   = build_meld_manifest(dev_csv, audio_dev, "dev")
meld_test_df  = build_meld_manifest(test_csv, audio_test, "test")

meld_df = pd.concat([meld_train_df, meld_dev_df, meld_test_df], ignore_index=True)
print("\nTOTAL MELD kept:", len(meld_df))



[train] total CSV rows: 9989
[train] kept: 9718 | missing_audio: 0 | dropped(not in 6): 271
[train] label counts:
 label
neutral     4710
happy       1743
surprise    1205
angry       1109
sad          683
fear         268
Name: count, dtype: int64

[dev] total CSV rows: 1109
[dev] kept: 1086 | missing_audio: 1 | dropped(not in 6): 22
[dev] label counts:
 label
neutral     469
happy       163
angry       153
surprise    150
sad         111
fear         40
Name: count, dtype: int64

[test] total CSV rows: 2610
[test] kept: 2542 | missing_audio: 0 | dropped(not in 6): 68
[test] label counts:
 label
neutral     1256
happy        402
angry        345
surprise     281
sad          208
fear          50
Name: count, dtype: int64

TOTAL MELD kept: 13346


## Save individual manifests

In [7]:
from pathlib import Path

out_dir = Path("manifests")
out_dir.mkdir(exist_ok=True)

rav_df.to_csv(out_dir / "ravdess_manifest.csv", index=False)
cre_df.to_csv(out_dir / "crema_manifest.csv", index=False)
meld_df.to_csv(out_dir / "meld_manifest.csv", index=False)

print("Saved:")
print(" -", out_dir / "ravdess_manifest.csv")
print(" -", out_dir / "crema_manifest.csv")
print(" -", out_dir / "meld_manifest.csv")


Saved:
 - manifests/ravdess_manifest.csv
 - manifests/crema_manifest.csv
 - manifests/meld_manifest.csv


## Combine all three, then split train/val/test by **speaker groups** (random ratios)

### This guarantees **no speaker overlap** across splits *after combining*.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

SEED = 42
rng = np.random.default_rng(SEED)

# Combine (keep only required columns)
keep_cols = ["path", "dataset", "speaker_id", "raw_label", "label"]
all_df = pd.concat([
    rav_df[keep_cols],
    cre_df[keep_cols],
    meld_df[keep_cols]
], ignore_index=True).drop_duplicates("path").reset_index(drop=True)

# Make speaker IDs globally unique across datasets
all_df["group_speaker"] = all_df["dataset"].astype(str) + "__" + all_df["speaker_id"].astype(str)

# Random speaker-level split by ratios
train_ratio, val_ratio, test_ratio = 0.80, 0.10, 0.10
speakers = all_df["group_speaker"].unique().tolist()
rng.shuffle(speakers)

n = len(speakers)
n_train = int(round(train_ratio * n))
n_val   = int(round(val_ratio * n))

train_sp = set(speakers[:n_train])
val_sp   = set(speakers[n_train:n_train+n_val])
test_sp  = set(speakers[n_train+n_val:])

def assign_split(gs):
    if gs in train_sp: return "train"
    if gs in val_sp:   return "val"
    return "test"

all_df["split"] = all_df["group_speaker"].map(assign_split)

print("Split counts:\n", all_df["split"].value_counts())
print("\nLabel counts:\n", all_df["label"].value_counts())

# Leak checks 
print("\nSpeaker overlap checks (must be 0):")
print("train∩val :", len(train_sp & val_sp))
print("train∩test:", len(train_sp & test_sp))
print("val∩test  :", len(val_sp & test_sp))

# Save final
out_dir = Path("manifests"); out_dir.mkdir(exist_ok=True)
final_path = out_dir / "final_manifest_speaker_disjoint_ratio.csv"
all_df.to_csv(final_path, index=False)
print("\n Saved:", final_path)


Split counts:
 split
train    16529
val       3077
test      1159
Name: count, dtype: int64

Label counts:
 label
neutral     7810
happy       3771
angry       3070
sad         2465
surprise    1828
fear        1821
Name: count, dtype: int64

Speaker overlap checks (must be 0):
train∩val : 0
train∩test: 0
val∩test  : 0

 Saved: manifests/final_manifest_speaker_disjoint_ratio.csv


## Check for imbalance per split

In [None]:
import pandas as pd

final_df = pd.read_csv("manifests/final_manifest_speaker_disjoint_ratio.csv")

ct = pd.crosstab(final_df["split"], final_df["label"])
print(ct)

print("\nPercent per split:")
print((ct.div(ct.sum(axis=1), axis=0) * 100).round(2))

# Baseline "always neutral" accuracy per split 
for sp in ["train","val","test"]:
    sub = final_df[final_df["split"]==sp]
    neutral_rate = (sub["label"]=="neutral").mean()
    print(f"{sp}: neutral-rate baseline accuracy = {neutral_rate:.3f}")


label  angry  fear  happy  neutral   sad  surprise
split                                             
test     205   188    208      291   199        68
train   2438  1401   2991     6282  1954      1463
val      427   232    572     1237   312       297

Percent per split:
label  angry   fear  happy  neutral    sad  surprise
split                                               
test   17.69  16.22  17.95    25.11  17.17      5.87
train  14.75   8.48  18.10    38.01  11.82      8.85
val    13.88   7.54  18.59    40.20  10.14      9.65
train: neutral-rate baseline accuracy = 0.380
val: neutral-rate baseline accuracy = 0.402
test: neutral-rate baseline accuracy = 0.251


## Weighted imbalance classes

In [None]:
import numpy as np

train_df = final_df[final_df["split"]=="train"]
labels = sorted(train_df["label"].unique())
counts = train_df["label"].value_counts().reindex(labels).values

# Inverse-frequency weights 
weights = counts.sum() / (len(labels) * counts)

label2id_tmp = {lab:i for i, lab in enumerate(labels)}
print("labels:", labels)
print("counts:", dict(zip(labels, counts)))
print("weights:", dict(zip(labels, weights.round(3))))


labels: ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']
counts: {'angry': 2438, 'fear': 1401, 'happy': 2991, 'neutral': 6282, 'sad': 1954, 'surprise': 1463}
weights: {'angry': 1.13, 'fear': 1.966, 'happy': 0.921, 'neutral': 0.439, 'sad': 1.41, 'surprise': 1.883}


In [11]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
    }


## WavLM training setup (weighted loss)

In [12]:
import datasets, sys
print("datasets module:", datasets)
print("datasets __file__:", getattr(datasets, "__file__", None))
print("sys.path[0]:", sys.path[0])


datasets module: <module 'datasets' from '/home/d8a0/AI_Project/env/lib/python3.12/site-packages/datasets/__init__.py'>
datasets __file__: /home/d8a0/AI_Project/env/lib/python3.12/site-packages/datasets/__init__.py
sys.path[0]: 


In [None]:
import contextlib
import numpy as np
if not hasattr(np, "_no_nep50_warning"):
    np._no_nep50_warning = contextlib.nullcontext
import inspect
import pandas as pd
import torch
import torchaudio
from torch import nn
from datasets import Dataset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer

assert torch.cuda.is_available(), "CUDA not available. You said you're using GPU—fix that first."

kw = inspect.signature(TrainingArguments.__init__).parameters
eval_key = "eval_strategy" if "eval_strategy" in kw else "evaluation_strategy"

# 1) Load manifest
df = pd.read_csv("manifests/final_manifest_speaker_disjoint_ratio.csv")

labels = LABELS  # use global label order
df["label_id"] = df["label"].map(label2id).astype(int)

train_df = df[df["split"]=="train"].reset_index(drop=True)
val_df   = df[df["split"]=="val"].reset_index(drop=True)
test_df  = df[df["split"]=="test"].reset_index(drop=True)

# 2) Class weights from TRAIN only
counts = train_df["label"].value_counts().reindex(labels, fill_value=0).values
counts = np.maximum(counts, 1)  # avoid divide-by-zero
class_weights = (counts.sum() / (len(labels) * counts)).astype(np.float32)
class_weights_t = torch.tensor(class_weights)

print("Train counts:", dict(zip(labels, counts)))
print("Class weights:", dict(zip(labels, class_weights.round(3))))

# 3) HF datasets (paths only)
train_ds_base = Dataset.from_pandas(train_df[["path","label_id"]])
val_ds_base   = Dataset.from_pandas(val_df[["path","label_id"]])
test_ds_base  = Dataset.from_pandas(test_df[["path","label_id"]])

MODEL_NAME = "microsoft/wavlm-base"
feat = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_NAME, num_labels=len(labels), label2id=label2id, id2label=id2label
)

TARGET_SR = 16000
MAX_SECONDS = 6
MAX_LEN = TARGET_SR * MAX_SECONDS

def load_and_resample(path: str):
    wav, sr = torchaudio.load(path)  # [C,T]
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)
    wav = wav.squeeze(0)
    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
    if wav.numel() > MAX_LEN:
        wav = wav[:MAX_LEN]
    return wav.numpy()

# Collator loads audio per batch 
def collate_fn_base(features):
    audios = [load_and_resample(f["path"]) for f in features]
    y = torch.tensor([f["label_id"] for f in features], dtype=torch.long)
    inputs = feat(
        audios,
        sampling_rate=TARGET_SR,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    inputs["labels"] = y
    return inputs

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        y = inputs.pop("labels")
        outputs = model(**inputs)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights_t.to(outputs.logits.device))
        loss = loss_fn(outputs.logits, y)
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir="wavlm_ser_ckpt",
    **{eval_key: "epoch"},
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    fp16=True,                 
    dataloader_num_workers=4,  # speed up CPU audio loading
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    remove_unused_columns=False,  # IMPORTANT: keep "path"/"label_id" for collator
    report_to="none",
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds_base,
    eval_dataset=val_ds_base,
    data_collator=collate_fn_base,
    compute_metrics=compute_metrics
)

trainer.train()


Train counts: {'angry': 2438, 'fear': 1401, 'happy': 2991, 'neutral': 6282, 'sad': 1954, 'surprise': 1463}
Class weights: {'angry': 1.13, 'fear': 1.966, 'happy': 0.921, 'neutral': 0.439, 'sad': 1.41, 'surprise': 1.883}


Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at microsoft/wavlm-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.3683,1.370193,0.533962,0.485319
2,1.1901,1.289811,0.501137,0.487298
3,1.0973,1.209045,0.567436,0.555547
4,1.0426,1.250443,0.546961,0.537144
5,0.9285,1.277703,0.561911,0.545677




TrainOutput(global_step=10335, training_loss=1.1697039851025792, metrics={'train_runtime': 1789.3381, 'train_samples_per_second': 46.187, 'train_steps_per_second': 5.776, 'total_flos': 3.8163157817943e+18, 'train_loss': 1.1697039851025792, 'epoch': 5.0})

## Save model

In [15]:
import os, json

BASE_DIR = "wavlm_ser_base"
os.makedirs(BASE_DIR, exist_ok=True)

trainer.save_model(BASE_DIR)
feat.save_pretrained(BASE_DIR)
with open(os.path.join(BASE_DIR, "labels.json"), "w", encoding="utf-8") as f:
    json.dump(labels, f, ensure_ascii=False, indent=2)

print("Saved BASE model to:", os.path.abspath(BASE_DIR))


Saved BASE model to: /home/d8a0/AI_Project/wavlm_ser_base


## Point to 60 labeled own dataset files and build a small manifest

In [None]:
from pathlib import Path
import pandas as pd

MY_DATA_DIR = Path(r"./my_audio_16k")  

# Exact label set 
label_set = set(LABELS)

def parse_label_and_speaker(p: Path):
    stem = p.stem.lower()  # e.g., "dog_angry_u"
    tokens = stem.split("_")

    # label is one of the tokens
    lab = next((t for t in tokens if t in label_set), None)

    # speaker: "_u" means colleague, else "me"
    speaker = "colleague" if "u" in tokens else "me"

    return lab, speaker

rows = []
for p in MY_DATA_DIR.glob("*.wav"):
    lab, speaker = parse_label_and_speaker(p)
    if lab is None:
        continue
    rows.append({"path": str(p), "label": lab, "speaker": speaker})

df_adapt = pd.DataFrame(rows)

print("Folder:", MY_DATA_DIR.resolve())
print("Found labeled wav files:", len(df_adapt))
print("\nLabel counts:\n", df_adapt["label"].value_counts() if len(df_adapt) else "NONE")
print("\nSpeaker counts:\n", df_adapt["speaker"].value_counts() if len(df_adapt) else "NONE")

# show unlabeled files if any
all_wavs = list(MY_DATA_DIR.glob("*.wav"))
unlabeled = [p.name for p in all_wavs if parse_label_and_speaker(p)[0] is None]
if unlabeled:
    print("\n These files had no detectable label token:", unlabeled[:20], ("..." if len(unlabeled)>20 else ""))


Folder: /home/d8a0/AI_Project/my_audio_16k
Found labeled wav files: 60

Label counts:
 label
fear        10
surprise    10
neutral     10
happy       10
angry       10
sad         10
Name: count, dtype: int64

Speaker counts:
 speaker
colleague    30
me           30
Name: count, dtype: int64


## Split train/val by speaker

In [17]:
train_df_mic = df_adapt[df_adapt["speaker"]=="me"].reset_index(drop=True)
val_df_mic   = df_adapt[df_adapt["speaker"]=="colleague"].reset_index(drop=True)

print("Train size:", len(train_df_mic), "| Val size:", len(val_df_mic))
print("\nTrain label counts:\n", train_df_mic["label"].value_counts())
print("\nVal label counts:\n", val_df_mic["label"].value_counts())


Train size: 30 | Val size: 30

Train label counts:
 label
fear        5
surprise    5
happy       5
neutral     5
angry       5
sad         5
Name: count, dtype: int64

Val label counts:
 label
fear        5
surprise    5
neutral     5
angry       5
happy       5
sad         5
Name: count, dtype: int64


## Create a tiny dataset loader with light augmentation


In [18]:
import numpy as np
import torch
import torchaudio
from torch.utils.data import Dataset

TARGET_SR = 16000
MAX_SECONDS = 6.0
MAX_LEN = int(TARGET_SR * MAX_SECONDS)


def load_wav(path):
    wav, sr = torchaudio.load(path)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)
    wav = wav.squeeze(0)
    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
    return wav

def random_crop(wav):
    if wav.numel() <= MAX_LEN:
        return wav
    start = torch.randint(0, wav.numel() - MAX_LEN + 1, (1,)).item()
    return wav[start:start+MAX_LEN]

def augment(wav):
    # random gain + tiny noise
    gain = 10 ** (torch.empty(1).uniform_(-0.3, 0.3).item())
    wav = wav * gain
    noise_level = torch.empty(1).uniform_(0.0, 0.01).item()
    wav = wav + noise_level * torch.randn_like(wav)
    return torch.clamp(wav, -1.0, 1.0)

class AdaptDS(Dataset):
    def __init__(self, df, train=True):
        self.df = df
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.df.loc[idx, "path"]
        y = label2id[self.df.loc[idx, "label"]]

        wav = load_wav(path)
        if self.train:
            wav = random_crop(wav)
            wav = augment(wav)
        else:
            if wav.numel() > MAX_LEN:
                wav = wav[-MAX_LEN:]

        return {"audio": wav.numpy().astype(np.float32), "label_id": y}

train_ds_mic = AdaptDS(train_df_mic, train=True)
val_ds_mic = AdaptDS(val_df_mic, train=False)


## Freeze everything except classifier head and fine-tune with early stopping

In [19]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from torch import nn
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch
import inspect

kw = inspect.signature(TrainingArguments.__init__).parameters
eval_key = "eval_strategy" if "eval_strategy" in kw else "evaluation_strategy"

# Freeze everything
for p in model.parameters():
    p.requires_grad = False

# Unfreeze classifier only
trainable = []
for name, p in model.named_parameters():
    if "classifier" in name:
        p.requires_grad = True
        trainable.append(name)

print("Trainable params:", trainable)

def collate_fn_mic(batch):
    audios = [b["audio"] for b in batch]
    y = torch.tensor([b["label_id"] for b in batch], dtype=torch.long)
    inputs = feat(audios, sampling_rate=TARGET_SR, padding=True, return_tensors="pt")
    inputs["labels"] = y
    return inputs

loss_fn = nn.CrossEntropyLoss()

def compute_metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
    }

class HeadOnlyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        y = inputs.pop("labels")
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, y)
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir="wavlm_head_adapt_ckpt",
    **{eval_key: "epoch"},
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=5e-5,
    weight_decay=0.05,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=30,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    remove_unused_columns=False,
    report_to="none",
)

trainer_adapt = HeadOnlyTrainer(
    model=model,
    args=args,
    train_dataset=train_ds_mic,
    eval_dataset=val_ds_mic,
    data_collator=collate_fn_mic,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer_adapt.train()
print("Val metrics:", trainer_adapt.evaluate())


Trainable params: ['classifier.weight', 'classifier.bias']


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.335392,0.633333,0.577941
2,No log,1.330691,0.633333,0.577941
3,No log,1.327073,0.633333,0.577941
4,No log,1.324772,0.633333,0.577941




Val metrics: {'eval_loss': 1.3353922367095947, 'eval_accuracy': 0.6333333333333333, 'eval_macro_f1': 0.5779405779405778, 'eval_runtime': 0.3555, 'eval_samples_per_second': 84.392, 'eval_steps_per_second': 22.505, 'epoch': 4.0}


## Save the adapted model

In [None]:
import os, json
import torch

SAVE_DIR = "wavlm_ser_model"  # final model for live testing
PTH_PATH = os.path.abspath("wavlm_ser_model.pth")

os.makedirs(SAVE_DIR, exist_ok=True)

# After trainer_adapt.train(), trainer_adapt.model is the final (often best) model
trainer_adapt.save_model(SAVE_DIR)     # saves config + weights
feat.save_pretrained(SAVE_DIR)         

# Save FINAL adapted model weights as .pth (state_dict)
torch.save(trainer_adapt.model.state_dict(), PTH_PATH)

with open(os.path.join(SAVE_DIR, "labels.json"), "w", encoding="utf-8") as f:
    json.dump(labels, f, ensure_ascii=False, indent=2)

print("Saved FINAL adapted model to:", os.path.abspath(SAVE_DIR))
print("Saved .pth to:", os.path.abspath(PTH_PATH))
print("Best checkpoint:", getattr(trainer_adapt.state, "best_model_checkpoint", None))


Saved FINAL adapted model to: /home/d8a0/AI_Project/wavlm_ser_model
Saved .pth to: /home/d8a0/AI_Project/wavlm_ser_model.pth
Best checkpoint: wavlm_head_adapt_ckpt/checkpoint-8


## Test model

In [21]:
trainer.evaluate(test_ds_base)




{'eval_loss': 0.9535818696022034,
 'eval_accuracy': 0.6764452113891286,
 'eval_macro_f1': 0.6635750220038531,
 'eval_runtime': 6.6772,
 'eval_samples_per_second': 173.576,
 'eval_steps_per_second': 21.716,
 'epoch': 5.0}

In [22]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

pred = trainer.predict(test_ds_base)
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=1)

print(classification_report(y_true, y_pred, target_names=labels, digits=3))
print(confusion_matrix(y_true, y_pred))




              precision    recall  f1-score   support

       angry      0.639     0.854     0.731       205
        fear      0.708     0.734     0.721       188
       happy      0.635     0.644     0.640       208
     neutral      0.774     0.601     0.677       291
         sad      0.674     0.613     0.642       199
    surprise      0.556     0.588     0.571        68

    accuracy                          0.676      1159
   macro avg      0.664     0.672     0.664      1159
weighted avg      0.684     0.676     0.675      1159

[[175   4  16   5   3   2]
 [  4 138  15   4  26   1]
 [ 40  10 134  11   5   8]
 [ 46   5  21 175  24  20]
 [  4  34  11  27 122   1]
 [  5   4  14   4   1  40]]
