In [1]:
import os
from data.source.pg_experiment import get_pg_experiment_dataframe
import polars as pl

from models.SimplifiedLightweightCNN import SimplifiedLightweightCNN
%load_ext autoreload
%autoreload 1
%aimport models.SimplifiedLightweightCNN
from models.SimpleCNN_v2 import train, evaluate
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from path import RESULT_DIRECTORY
import wandb

# Sets KMP_DUPLICATE_LIB_OK for this process. Presumably a workaround for the
# OpenMP "duplicate runtime" abort that some torch/numpy installs trigger —
# TODO confirm it is still needed, and whether it has to be set before
# `import torch` (it currently runs after it) to have any effect.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
# Load the pronunciation and tone tables. The ".ogg" argument presumably selects
# the recording file extension (the paths logged later end in .ogg) — TODO confirm.
df_pron, df_tone = get_pg_experiment_dataframe(".ogg")

# Get the target words with val accuracy above 70%
TARGET_WORDS = ["a0", "a1", "a100", "a2", "a3", "a5", "a8"]

# Apply every row filter BEFORE encoding the word ids. Ranking first and
# filtering afterwards could leave gaps in the dense ids (a target word with no
# stage-1 rows would still consume a rank), which would make N_WORDS smaller
# than the largest id that is later used as a class index.
dataframe = (
    df_pron
    .filter(pl.col("word_id").is_in(TARGET_WORDS))
    .filter(pl.col("stage") == 1)
)

# Encode word ids as dense ranks (1-based; the dataset code subtracts 1 to get
# 0-based indices) and store the target value as float32.
dataframe = dataframe.with_columns([
    pl.struct("word_id").rank("dense").alias("word_id"),
    pl.col("value").cast(pl.Float32),
])

N_WORDS = dataframe.select(pl.col("word_id").n_unique()).item()
print(f"Number of unique words: {N_WORDS}")
print(f"Number of samples: {dataframe.shape[0]}")


Number of unique words: 7
Number of samples: 3527


In [3]:
import polars as pl
import numpy as np
from typing import Tuple

def stratified_split(df: pl.DataFrame, label_col: str, train_frac=0.8, val_frac=0.1, seed=42) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
    """Split ``df`` into train/validation/test frames, class by class.

    Rows are grouped by the value in ``label_col``. Each group is shuffled with
    a seeded RNG and cut into three consecutive slices, so every split keeps
    roughly the same class proportions as ``df``.

    Args:
        df: Frame to split.
        label_col: Name of the column whose values define the classes.
        train_frac: Fraction of each class assigned to the training split.
        val_frac: Fraction of each class assigned to the validation split.
            The remainder of each class goes to the test split.
        seed: Seed for the NumPy RNG that shuffles each class.

    Returns:
        ``(train_df, val_df, test_df)``. Rows are grouped by class, classes
        appear in order of first occurrence in ``df``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so that e.g. 0.7 + 0.3 is not rejected because of
    # floating-point rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            f"train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    # pl.concat() rejects an empty list, so return three empty frames that
    # keep the input schema instead of crashing.
    if df.height == 0:
        return df.clear(), df.clear(), df.clear()

    rng = np.random.RandomState(seed)
    train_rows, val_rows, test_rows = [], [], []

    # maintain_order=True makes the class iteration order deterministic. The
    # RNG is consumed once per class, so an arbitrary class order (which a
    # plain unique() may produce) would make the split differ between runs
    # despite the fixed seed. Partitioning also keeps rows with a null label,
    # which an equality filter would drop.
    for class_df in df.partition_by(label_col, maintain_order=True):
        n = class_df.height
        indices = rng.permutation(n)

        # int() truncates, so any rounding remainder ends up in the test split.
        train_end = int(train_frac * n)
        val_end = int((train_frac + val_frac) * n)

        train_rows.append(class_df[indices[:train_end]])
        val_rows.append(class_df[indices[train_end:val_end]])
        test_rows.append(class_df[indices[val_end:]])

    train_df = pl.concat(train_rows)
    val_df = pl.concat(val_rows)
    test_df = pl.concat(test_rows)

    return train_df, val_df, test_df


In [4]:
from typing import Callable

from polars import DataFrame
from dataset import Cast, TorchDataset
from develop import reload_function, reload_module
import pytorch_dataloader
# Reload first so the names imported on the next line come from the fresh module.
reload_module(pytorch_dataloader)
from pytorch_dataloader import ReshapeCollate, build_collate_fn, PaddingCollate, DefaultCollate
from functools import partial

from transformation import Channels, RMSEnergy, TorchVadLogMelSpec, TorchVadMFCC, ZeroCrossingRate

reload_function(TorchVadMFCC)

# Split fractions; the test share is whatever train and validation leave over.
TRAIN_SPLIT = 0.6
VAL_SPLIT = 0.2
TEST_SPLIT = 1 - TRAIN_SPLIT - VAL_SPLIT
train_pl, val_pl, test_pl = stratified_split(
    dataframe,
    label_col="value",
    train_frac=TRAIN_SPLIT,
    val_frac=VAL_SPLIT,
)


def to_dataset(dataframe: DataFrame) -> TorchDataset:
    """Wrap one split of the frame in a ``TorchDataset`` with four fields.

    The fields, in order, are: MFCC features computed from ``rec_path``,
    zero-crossing-rate + RMS-energy features computed from ``rec_path``,
    the 0-based word index (``word_id - 1``) as a long tensor, and ``value``
    as a float tensor.
    """
    mfcc_field = Cast(
        dataframe.get_column("rec_path"),
        Channels("stack", "multiply")(
            TorchVadMFCC(delta=0),
        ),
    )
    envelope_field = Cast(
        dataframe.get_column("rec_path"),
        Channels("cat", "multiply")(
            ZeroCrossingRate(),
            RMSEnergy(),
        ),
    )
    # word_id is 1-based in the frame; shift it to a 0-based index.
    word_field = Cast(
        dataframe.get_column("word_id"),
        lambda word_id: torch.tensor(word_id - 1, dtype=torch.long),
    )
    target_field = Cast(
        dataframe.get_column("value"),
        lambda value: torch.tensor(value).float(),
    )
    return TorchDataset(mfcc_field, envelope_field, word_field, target_field)


# One collate rule per dataset field, in the same order as the fields above.
# The two feature fields are brought to a fixed length (80 and 160) along
# pad_dim — presumably the time axis of each sample, TODO confirm — and the
# two scalar fields use the default collation.
collate_fn = build_collate_fn(
    PaddingCollate(mode="SET_MAX_LEN", max_len=80, pad_dim=2),
    PaddingCollate(mode="SET_MAX_LEN", max_len=160, pad_dim=1),
    DefaultCollate(),
    DefaultCollate(),
)

dataset_train = to_dataset(train_pl)
dataset_val = to_dataset(val_pl)
dataset_test = to_dataset(test_pl)

In [5]:
from pytorch_dataloader import MemoryLoadedDataLoader
from os import name

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE: on Windows (os.name == "nt") `num_workers` MUST be 0 — worker processes
# do not work in notebooks there because Windows has no `fork()`.
num_workers = 4 if name != "nt" else 0

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    dataset_train,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
)
val_loader = DataLoader(
    dataset_val,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
)
test_loader = DataLoader(
    dataset_test,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
)

# Sanity check: print the shape of every field in the first training batch.
first_batch = next(iter(train_loader))
for x in first_batch:
    print(x.shape)

/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/774/a5.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/484/a0.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/589/a8.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/523/a1.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/1059/a2.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/1536/a2.ogg has no speech segments, using full waveform
torch.Size([16, 1, 40, 80])
torch.Size([16, 2, 160])
torch.Size([16]

In [6]:
# Replace the train/validation loaders with in-memory versions on `device`.
# The isinstance guards make this cell idempotent: re-running it no longer
# wraps an already wrapped loader a second time. (Assumes
# MemoryLoadedDataLoader is a class — TODO confirm.)
if not isinstance(train_loader, MemoryLoadedDataLoader):
    train_loader = MemoryLoadedDataLoader(train_loader, device=device)
print("Loaded train loader into memory")
if not isinstance(val_loader, MemoryLoadedDataLoader):
    val_loader = MemoryLoadedDataLoader(val_loader, device=device)
print("Loaded validation loader into memory")

/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/772/a5.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/484/a100.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/589/a8.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/587/a1.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/1536/a2.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/345/a2.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/sou

In [10]:
from models.FusionCNN import ContextFusionCNN

# Refresh the class before instantiating it so recent source edits are used.
reload_function(ContextFusionCNN)

# The positional 1 and 2 presumably are the channel counts of the two feature
# inputs (batches are [B, 1, 40, 80] and [B, 2, 160]) — TODO confirm against
# the ContextFusionCNN signature.
model = ContextFusionCNN(
    1,
    2,
    num_words=N_WORDS,
)

In [11]:
# Model variables definition.
pth = "ContextFusionCNN.pth"
lr = 1e-4  # Reduce from 1e-3
epochs = 140

# Resolve the checkpoint location and create the results directory BEFORE
# training. torch.save does not create missing parent directories, so doing
# this first surfaces a bad path immediately instead of after all epochs have
# run and the trained weights would be lost.
os.makedirs(RESULT_DIRECTORY, exist_ok=True)
checkpoint_path = os.path.join(RESULT_DIRECTORY, pth)

model = model.to(device)
reload_function(train)
reload_function(evaluate)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)  # Add L2 regularization

# Add learning rate scheduler: halve the learning rate when the validation
# loss has not improved for 5 consecutive epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)
# BCELoss expects probabilities in [0, 1], so the model output is presumably
# already passed through a sigmoid — TODO confirm in ContextFusionCNN.
criterion = nn.BCELoss()

# Training loop
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    # Update learning rate
    scheduler.step(val_loss)

    print(
        f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# Saving the model locally (weights of the final epoch)
torch.save(model.state_dict(), checkpoint_path)

Epoch 1, Train Loss: 0.6862, Train Acc: 0.5641, Val Loss: 0.6831, Val Acc: 0.5623
Epoch 2, Train Loss: 0.6818, Train Acc: 0.5740, Val Loss: 0.6773, Val Acc: 0.5637
Epoch 3, Train Loss: 0.6775, Train Acc: 0.5764, Val Loss: 0.6711, Val Acc: 0.5737
Epoch 4, Train Loss: 0.6719, Train Acc: 0.5745, Val Loss: 0.6636, Val Acc: 0.5807
Epoch 5, Train Loss: 0.6689, Train Acc: 0.5943, Val Loss: 0.6575, Val Acc: 0.5992
Epoch 6, Train Loss: 0.6649, Train Acc: 0.6080, Val Loss: 0.6494, Val Acc: 0.6643
Epoch 7, Train Loss: 0.6595, Train Acc: 0.6090, Val Loss: 0.6431, Val Acc: 0.6969
Epoch 8, Train Loss: 0.6576, Train Acc: 0.6232, Val Loss: 0.6373, Val Acc: 0.7082
Epoch 9, Train Loss: 0.6567, Train Acc: 0.6161, Val Loss: 0.6288, Val Acc: 0.7096
Epoch 10, Train Loss: 0.6467, Train Acc: 0.6217, Val Loss: 0.6221, Val Acc: 0.7096
Epoch 11, Train Loss: 0.6444, Train Acc: 0.6378, Val Loss: 0.6154, Val Acc: 0.7181
Epoch 12, Train Loss: 0.6387, Train Acc: 0.6421, Val Loss: 0.6101, Val Acc: 0.7167
Epoch 13, Tra

In [None]:
# Replace the test loader with an in-memory version on `device`. The isinstance
# guard makes this cell idempotent: re-running it no longer wraps an already
# wrapped loader a second time. (Assumes MemoryLoadedDataLoader is a class —
# TODO confirm.)
if not isinstance(test_loader, MemoryLoadedDataLoader):
    test_loader = MemoryLoadedDataLoader(test_loader, device=device)
print("Loaded test loader into memory")

/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/493/a8.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/550/a100.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/620/a5.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/489/a5.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/772/a1.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/source/pg_dataset/recordings/stageI/668/a100.ogg has no speech segments, using full waveform
/home/kamil2002/Mandarin_Pronunciation_Recognition_Project/data/so

In [None]:
# Put the model in evaluation mode, then score it once on the held-out test split.
model.eval()
test_loss, test_acc = evaluate(model, test_loader, criterion, device)

# Report the final metrics between two 30-character rules.
separator = "-" * 30
print(separator)
print("FINAL TEST RESULTS")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(separator)

------------------------------
FINAL TEST RESULTS
Test Loss: 0.5561
Test Accuracy: 0.7195
------------------------------
