In [1]:
import os

import polars as pl

import src.models.SimplifiedLightweightCNN as model_module
%load_ext autoreload
%autoreload 1
%aimport src.models.SimplifiedLightweightCNN
from data.source.pg_experiment import get_pg_experiment_dataset, AUDIO_PATH
from models.SimpleCNN_v2 import PronunciationDataset, collate_fn, train, evaluate
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from path import RESULT_DIRECTORY
import wandb

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
df_pron, _ = get_pg_experiment_dataset(".wav")

# Word filtering
# word = "a0"  # The first word - zero
# df_pron = df_pron.filter(pl.col("word_id").eq(word))

df_stageI_polish = df_pron.filter(pl.col("word_id").str.starts_with("a") & pl.col("mother")
                                  .str.strip_chars()
                                  .str.to_lowercase()
                                  .is_in(["polish", "polski"]))

df_other_lang = df_pron.filter(
    pl.col("word_id").str.starts_with("a") &
    (~pl.col("mother")
     .str.strip_chars()
     .str.to_lowercase()
     .is_in(["polish", "polski"]))
)


In [3]:
import numpy as np

np.shape(df_stageI_polish)

(5732, 7)

In [4]:
count_missing = 0
rec_paths = df_other_lang.select("rec_path").to_series()

for path in rec_paths:
    full_path = os.path.normpath(os.path.join(AUDIO_PATH, path))
    if not os.path.exists(full_path):
        count_missing += 1

count_missing_pl = 0
rec_paths = df_stageI_polish.select("rec_path").to_series()

for path in rec_paths:
    full_path = os.path.normpath(os.path.join(AUDIO_PATH, path))
    if not os.path.exists(full_path):
        count_missing_pl += 1

print(f"Missing files in mother language Polish: {count_missing_pl}")
print(f"Missing files in mother language other than Polish: {count_missing}")

Missing files in mother language Polish: 0
Missing files in mother language other than Polish: 2


In [5]:
# Find indices of rows with missing files
rec_paths = df_other_lang.select("rec_path").to_series()
missing_indices = []

for i, path in enumerate(rec_paths):
    full_path = os.path.normpath(os.path.join(AUDIO_PATH, path))
    if not os.path.exists(full_path):
        missing_indices.append(i)

# Print the deleted rows
deleted_rows = df_other_lang[missing_indices]
print(deleted_rows)

df_other_lang = df_other_lang.filter(
    ~pl.Series(range(len(df_other_lang))).is_in(pl.Series(missing_indices).implode())
)



shape: (2, 7)
┌────────────┬───────┬─────────┬───────────────────┬───────────┬────────┬─────────┐
│ id_student ┆ value ┆ word_id ┆ rec_path          ┆ univ      ┆ gender ┆ mother  │
│ ---        ┆ ---   ┆ ---     ┆ ---               ┆ ---       ┆ ---    ┆ ---     │
│ i64        ┆ i64   ┆ str     ┆ str               ┆ str       ┆ str    ┆ str     │
╞════════════╪═══════╪═════════╪═══════════════════╪═══════════╪════════╪═════════╡
│ 757        ┆ 1     ┆ a0      ┆ stageI\757\a0.wav ┆ CLES_UMK2 ┆ f      ┆ chinese │
│ 757        ┆ 1     ┆ a1      ┆ stageI\757\a1.wav ┆ CLES_UMK2 ┆ f      ┆ chinese │
└────────────┴───────┴─────────┴───────────────────┴───────────┴────────┴─────────┘


In [6]:
import polars as pl

train_split = 0.6
val_split = 0.2
test_split = 1 - train_split - val_split


def stratified_split(df: pl.DataFrame, label_col: str, train_frac=train_split, val_frac=val_split, seed=42):
    classes = df.select(label_col).unique().to_series()
    train_rows, val_rows, test_rows = [], [], []

    rng = np.random.RandomState(seed)

    for cls in classes:
        class_df = df.filter(pl.col(label_col) == cls)
        n = class_df.height
        indices = rng.permutation(n)

        train_end = int(train_frac * n)
        val_end = int((train_frac + val_frac) * n)

        train_rows.append(class_df[indices[:train_end]])
        val_rows.append(class_df[indices[train_end:val_end]])
        test_rows.append(class_df[indices[val_end:]])

    train_df = pl.concat(train_rows)
    val_df = pl.concat(val_rows)
    test_df = pl.concat(test_rows)

    return train_df, val_df, test_df


In [7]:
# dataset = PronunciationDataset(df_stageI_polish, base_dir=AUDIO_PATH)
dataset_other = PronunciationDataset(df_other_lang, base_dir=AUDIO_PATH)

In [8]:
train_pl, val_pl, test_pl = stratified_split(df_stageI_polish, label_col="value", train_frac=0.6, val_frac=0.2)

dataset_train = PronunciationDataset(train_pl, base_dir=AUDIO_PATH)
dataset_val = PronunciationDataset(val_pl, base_dir=AUDIO_PATH)
dataset_test = PronunciationDataset(test_pl, base_dir=AUDIO_PATH)


In [9]:
from pytorch_dataloader import MemoryLoadedDataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True, collate_fn=collate_fn, num_workers=8)
val_loader = DataLoader(dataset_val, batch_size=16, shuffle=False, collate_fn=collate_fn, num_workers=8)
test_loader = DataLoader(dataset_test, batch_size=16, shuffle=False, collate_fn=collate_fn, num_workers=8)

train_loader = MemoryLoadedDataLoader(train_loader, device=device)
print("Loaded train loader into memory")
val_loader = MemoryLoadedDataLoader(val_loader, device=device)
print("Loaded validation loader into memory")

Loaded train loader into memory
Loaded validation loader into memory


In [None]:
# Model variables definition.
dropout_rate = 0.40
model = model_module.SimplifiedLightweightCNN(input_channels=1, num_classes=1, dropout_rate=dropout_rate)
print(model.eval())
name = "StageI-SimplifiedLightweightCNN"
lr = 1e-4  # Reduce from 1e-3
epochs = 50
0
model = model.to(device)
# weight_decay = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Add L2 regularization
# Add learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

criterion = nn.BCEWithLogitsLoss()

# Start a new wandb run to track this script.
run = wandb.init(
    # name of the run
    name=name,
    config={
        "Name": name,
        "learning_rate": lr,
        "optimizer": "Adam",
        "criterion": "BCELoss",
        "dataset": "BothStages-only-polish",
        "train_val_test(%)": f'{train_split}-{val_split}-{test_split}',
        "epochs": epochs,
        "classifier_dropout_rate": dropout_rate
    },
)

# Training loop
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    # Update learning rate
    scheduler.step(val_loss)
    # Logging the metadata for each epoch so that the charts can be generated on the dashboard
    run.log({"train_acc": train_acc, "train_loss": train_loss, "val_acc": val_acc, "val_loss": val_loss, })
    print(
        f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f},"
        f"Train acc: {train_acc * 100:.2f}%, Val Acc: {val_acc * 100:.2f}%")

run.log({"model_eval": model.eval()})
# Saving the model to pth and adding it to the artifacts of the run, there is 5GB of memory on wandb, so we should be fine.
torch.save(model.state_dict(), os.path.join(RESULT_DIRECTORY, f'{name}.pth'))
artifact = wandb.Artifact(name, type="model")
artifact.add_file(os.path.join(RESULT_DIRECTORY, f'{name}.pth'))
run.log_artifact(artifact)

# Finish the run so it gets sent to the remote. You can discover the run right after that on the dashboard.
run.finish()


SimplifiedLightweightCNN(
  (block1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(5, 3), stride=(1, 1), padding=(2, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU()
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): SiLU()
    (6): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (7): Dropout2d(p=0.1, inplace=False)
  )
  (block2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): SiLU()
    (6): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dil

In [11]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#
# model_saved = LightweightCNN()
# model_saved.load_state_dict(torch.load(os.path.join(RESULT_DIRECTORY, "initial_model_t&val_080101split.pth")))
# model_saved.to(device)
# model_saved.eval()

In [12]:
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

Test Loss: 0.6589, Test Acc: 0.6142


In [13]:
df_other_lang_loader = DataLoader(dataset_other, batch_size=16,
                                  shuffle=False, collate_fn=collate_fn, num_workers=4)
df_other_lang_loader = MemoryLoadedDataLoader(df_other_lang_loader, device=device)
print("df_other_lang_loader into memory")

df_other_lang_loader into memory


In [14]:
other_lang_loss, other_lang_acc = evaluate(model, df_other_lang_loader, criterion, device)
print(f"Other mother language Loss: {other_lang_loss:.4f}, Other mother language Acc: {other_lang_acc:.4f}")

Other mother language Loss: 0.6213, Other mother language Acc: 0.6866
