In [1]:
import os

# Workaround flag for the OpenMP "duplicate runtime" abort. It is placed ahead
# of the numeric imports so the variable is already in the environment when
# those libraries are loaded.
# NOTE(review): this flag hides a packaging conflict rather than fixing it —
# confirm it is still required in the current environment.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from data.source.pg_experiment import get_pg_experiment_dataset, AUDIO_PATH
import polars as pl
from models.SimpleCNN import PronunciationDataset, collate_fn, SimpleCNN, train, evaluate
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from path import RESULT_DIRECTORY

In [2]:
# Only the first element returned by get_pg_experiment_dataset is used here.
df_pron, _ = get_pg_experiment_dataset(".wav")

# Shared building blocks for both subsets:
#   * word ids that begin with "a"
#   * mother tongue, whitespace-trimmed and lower-cased, equal to "polish"/"polski"
is_a_word = pl.col("word_id").str.starts_with("a")
mother_normalized = pl.col("mother").str.strip_chars().str.to_lowercase()
is_polish_native = mother_normalized.is_in(["polish", "polski"])

# "a" words recorded by speakers whose mother tongue is Polish ...
df_stageI_polish = df_pron.filter(is_a_word & is_polish_native)

# ... and the same words recorded by everyone else.
df_other_lang = df_pron.filter(is_a_word & ~is_polish_native)


In [3]:
def _count_missing_recordings(frame):
    """Count the ``rec_path`` entries of ``frame`` that do not exist under AUDIO_PATH."""
    return sum(
        not os.path.exists(os.path.normpath(os.path.join(AUDIO_PATH, rel_path)))
        for rel_path in frame.select("rec_path").to_series()
    )


count_missing = _count_missing_recordings(df_other_lang)
count_missing_pl = _count_missing_recordings(df_stageI_polish)

print(f"Missing files in mother language Polish: {count_missing_pl}")
print(f"Missing files in mother language other than Polish: {count_missing}")

Missing files in mother language Polish: 0
Missing files in mother language other than Polish: 2


In [4]:
# Build a row-aligned Boolean mask: True when the audio file referenced by
# ``rec_path`` exists under AUDIO_PATH. The dtype is given explicitly so the
# mask stays Boolean even when the frame has no rows.
rec_paths = df_other_lang.select("rec_path").to_series()
file_exists = pl.Series(
    "file_exists",
    [
        os.path.exists(os.path.normpath(os.path.join(AUDIO_PATH, path)))
        for path in rec_paths
    ],
    dtype=pl.Boolean,
)

# Print the rows that are about to be dropped
deleted_rows = df_other_lang.filter(~file_exists)
print(deleted_rows)

# Keep only rows whose recording is present. The name is rebound on purpose:
# later cells read ``df_other_lang``. Because the mask is recomputed from the
# current frame, re-running this cell simply finds nothing left to drop.
df_other_lang = df_other_lang.filter(file_exists)



shape: (2, 7)
┌────────────┬───────┬─────────┬───────────────────┬───────────┬────────┬─────────┐
│ id_student ┆ value ┆ word_id ┆ rec_path          ┆ univ      ┆ gender ┆ mother  │
│ ---        ┆ ---   ┆ ---     ┆ ---               ┆ ---       ┆ ---    ┆ ---     │
│ i64        ┆ i64   ┆ str     ┆ str               ┆ str       ┆ str    ┆ str     │
╞════════════╪═══════╪═════════╪═══════════════════╪═══════════╪════════╪═════════╡
│ 757        ┆ 1     ┆ a0      ┆ stageI/757/a0.wav ┆ CLES_UMK2 ┆ f      ┆ chinese │
│ 757        ┆ 1     ┆ a1      ┆ stageI/757/a1.wav ┆ CLES_UMK2 ┆ f      ┆ chinese │
└────────────┴───────┴─────────┴───────────────────┴───────────┴────────┴─────────┘


In [5]:
import polars as pl
import numpy as np

def stratified_split(df: pl.DataFrame, label_col: str, train_frac=0.8, val_frac=0.1, seed=42):
    """Split ``df`` into train / validation / test frames, class by class.

    Every distinct value of ``label_col`` is shuffled and cut independently, so
    each split receives roughly the same share of every class.

    Args:
        df: Frame to split.
        label_col: Name of the column holding the class label.
        train_frac: Fraction of each class assigned to the training split.
        val_frac: Fraction of each class assigned to the validation split.
            Whatever remains (``1 - train_frac - val_frac``) goes to the test split.
        seed: Seed for the NumPy random state that drives the shuffling.

    Returns:
        ``(train_df, val_df, test_df)``. Split sizes are floored per class, so
        rounding leftovers end up in the test split. Rows are grouped by class
        inside each returned frame (they are not re-shuffled across classes).

    Raises:
        ValueError: If a fraction is negative or ``train_frac + val_frac`` exceeds 1.

    Note:
        Rows whose label is null are not matched by the per-class equality
        filter and therefore appear in none of the splits.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by float rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    # ``unique()`` gives no ordering guarantee. All classes draw from one RNG
    # stream, so the iteration order must be fixed (sorted) for the same seed
    # to reproduce the same split.
    classes = df.select(label_col).to_series().unique().sort()
    train_rows, val_rows, test_rows = [], [], []

    rng = np.random.RandomState(seed)

    for cls in classes:
        class_df = df.filter(pl.col(label_col) == cls)
        n = class_df.height
        indices = rng.permutation(n)

        train_end = int(train_frac * n)
        val_end = int((train_frac + val_frac) * n)

        train_rows.append(class_df[indices[:train_end]])
        val_rows.append(class_df[indices[train_end:val_end]])
        test_rows.append(class_df[indices[val_end:]])

    if not train_rows:
        # Nothing to concatenate (empty frame): return empty frames that keep the schema.
        empty = df.head(0)
        return empty, empty, empty

    train_df = pl.concat(train_rows)
    val_df = pl.concat(val_rows)
    test_df = pl.concat(test_rows)

    return train_df, val_df, test_df


In [7]:
# Dataset over the non-Polish subset; recording paths are resolved against AUDIO_PATH.
dataset_other = PronunciationDataset(
    df_other_lang,
    base_dir=AUDIO_PATH,
)

In [6]:
# Stratify on the "value" label using stratified_split's default fractions.
splits_pl = stratified_split(df_stageI_polish, label_col="value")
train_pl, val_pl, test_pl = splits_pl

# One dataset per split, built in train / val / test order.
dataset_train, dataset_val, dataset_test = (
    PronunciationDataset(split_df, base_dir=AUDIO_PATH) for split_df in splits_pl
)


In [None]:
# Seed torch's global RNG so that weight initialisation and the shuffled
# training order are repeatable across runs (the data split is seeded separately).
# NOTE(review): bit-exact repeatability on GPU may need further determinism settings.
SEED = 42
BATCH_SIZE = 16
torch.manual_seed(SEED)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
# test_loader is created here but is not consumed by the loop below.
test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleCNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# BCELoss expects probabilities in [0, 1].
# NOTE(review): assumes SimpleCNN already applies a sigmoid to its output — confirm.
criterion = nn.BCELoss()

epochs = 10
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


Batch 287/287, Loss: 0.7358, Accuracy: 0.3333
Epoch 1, Train Loss: 0.6726, Train Acc: 0.5832, Val Loss: 0.6575, Val Acc: 0.5881
Batch 287/287, Loss: 0.7032, Accuracy: 0.5556
Epoch 2, Train Loss: 0.6504, Train Acc: 0.6120, Val Loss: 0.6448, Val Acc: 0.6143
Batch 287/287, Loss: 0.6656, Accuracy: 0.5556
Epoch 3, Train Loss: 0.6288, Train Acc: 0.6375, Val Loss: 0.6255, Val Acc: 0.6195
Batch 287/287, Loss: 0.5408, Accuracy: 0.8889
Epoch 4, Train Loss: 0.6137, Train Acc: 0.6510, Val Loss: 0.6220, Val Acc: 0.6230
Batch 287/287, Loss: 0.5648, Accuracy: 0.6667
Epoch 5, Train Loss: 0.5935, Train Acc: 0.6759, Val Loss: 0.6095, Val Acc: 0.6387
Batch 287/287, Loss: 0.5465, Accuracy: 0.8889
Epoch 6, Train Loss: 0.5750, Train Acc: 0.6942, Val Loss: 0.6049, Val Acc: 0.6527
Batch 287/287, Loss: 0.4045, Accuracy: 0.7778
Epoch 7, Train Loss: 0.5462, Train Acc: 0.7095, Val Loss: 0.6077, Val Acc: 0.6736
Batch 287/287, Loss: 0.8588, Accuracy: 0.3333
Epoch 8, Train Loss: 0.5146, Train Acc: 0.7348, Val Loss: 

In [None]:
# Persist only the model's state_dict (not the whole module) under RESULT_DIRECTORY.
checkpoint_path = os.path.join(RESULT_DIRECTORY, "initial_model_t&val_080101split.pth")
torch.save(model.state_dict(), checkpoint_path)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint_path = os.path.join(RESULT_DIRECTORY, "initial_model_t&val_080101split.pth")

model_saved = SimpleCNN()
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint written on a GPU machine can also be restored on a CPU-only one.
model_saved.load_state_dict(torch.load(checkpoint_path, map_location=device))
model_saved.to(device)
# Switch to inference mode; as the cell's last expression this also displays the module.
model_saved.eval()

SimpleCNN(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=16384, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
from torch.utils.data import DataLoader

# How many samples from the non-Polish dataset to print.
NUM_PREVIEW_SAMPLES = 10

# Create DataLoader for other dataset (batch_size=1 for per-sample inference)
other_loader = DataLoader(dataset_other, batch_size=1, shuffle=False, collate_fn=collate_fn)

model_saved.eval()
with torch.no_grad():
    # zip() stops as soon as the range is exhausted, so no extra sample is
    # loaded from disk just to be thrown away.
    for i, (spec, label) in zip(range(NUM_PREVIEW_SAMPLES), other_loader):
        spec = spec.to(device)
        output = model_saved(spec)
        # Threshold the model output at 0.5 to obtain a hard 0/1 prediction.
        pred = (output > 0.5).float()
        print(f"Sample {i+1}: Prediction={pred.item()}, Ground Truth={label.item()}")


Sample 1: Prediction=1.0, Ground Truth=1.0
Sample 2: Prediction=0.0, Ground Truth=0.0
Sample 3: Prediction=0.0, Ground Truth=0.0
Sample 4: Prediction=1.0, Ground Truth=0.0
Sample 5: Prediction=0.0, Ground Truth=1.0
Sample 6: Prediction=1.0, Ground Truth=1.0
Sample 7: Prediction=0.0, Ground Truth=0.0
Sample 8: Prediction=1.0, Ground Truth=1.0
Sample 9: Prediction=0.0, Ground Truth=0.0
Sample 10: Prediction=0.0, Ground Truth=1.0
