In [1]:
import os
import glob
import random
import string
import math
import torch
import torchaudio
import pandas as pd
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
import tqdm
import pathlib
import shutil
from torch.utils.data import Dataset, random_split, DataLoader, Subset
from transformers.feature_extraction_utils import BatchFeature
import torch.optim as optim
from sklearn.model_selection import train_test_split 
import numpy as np
!pip install python-Levenshtein
import Levenshtein
from collections import defaultdict
import re

2025-05-13 14:30:57.039566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747146657.235632      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747146657.296080      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collecte

In [2]:
df1 = pd.read_csv('/kaggle/input/receptiondata/librispeech_manifest_with_feats.csv')
df2 = pd.read_csv('/kaggle/input/wave2vec-features-16001-to-24000/librispeech_manifest_with_feats.csv')
df3 = pd.read_csv('/kaggle/input/librifeatures1/librispeech_manifest_with_feats.csv')
df4 = pd.read_csv('/kaggle/input/librifeatures-24001-32000/librispeech_manifest_with_feats.csv')
df5 = pd.read_csv('/kaggle/input/librifeatures-32001-40000/librispeech_manifest_with_feats.csv')
df6 = pd.read_csv('/kaggle/input/librifeatures-40001-48000/librispeech_manifest_with_feats.csv')


df1['feature_path'] = df1['feature_path'].str.replace("/kaggle/working", "/kaggle/input/receptiondata")
df2['feature_path'] = df2['feature_path'].str.replace("/kaggle/working", "/kaggle/input/wave2vec-features-16001-to-24000")
df3['feature_path'] = df3['feature_path'].str.replace("/kaggle/working", "/kaggle/input/librifeatures1")
df4['feature_path'] = df4['feature_path'].str.replace("/kaggle/working", "/kaggle/input/librifeatures-24001-32000")
df5['feature_path'] = df5['feature_path'].str.replace("/kaggle/working", "/kaggle/input/librifeatures-32001-40000")
df6['feature_path'] = df6['feature_path'].str.replace("/kaggle/working", "/kaggle/input/librifeatures-40001-48000")

df = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)

mask1 = ~(df['text'].str.contains('input', case=False, na=False) & 
         df['text'].str.contains('jpg', case=False, na=False))

mask2 = ~(df['text'].str.contains('say', case=False, na=False) & 
         df['text'].str.contains('repeatedly', case=False, na=False))

mask = mask1 & mask2

df = df[mask]

def normalize_transcript(text):
        
    text = text.lower()
    normalized = re.sub(r'[^a-z ]+', '', text)
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized

df['text'] = df['text'].apply(normalize_transcript)
df.to_csv('mlpr-libri-kaggle.csv', index=False)

In [3]:
class LibriASRDataset(Dataset):
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        feature_path = row["feature_path"]
        transcript = row["text"]
        speaker = row["speaker_id"]
    
        try:
            features = torch.load(feature_path, map_location='cpu')
        except Exception as e:
            raise RuntimeError(f"Failed to load feature from '{feature_path}': {e}")
        
        if isinstance(features, dict):
            input_values = features.get("input_values")
            if input_values is None:
                raise ValueError(f"'input_values' key not found in features loaded from {feature_path}")
        elif hasattr(features, "input_values"):
            input_values = features.input_values
        else:
            input_values = features
    
        if not isinstance(input_values, torch.Tensor):
            input_values = torch.tensor(input_values)
    
        if input_values.dim() == 3:
            input_values = input_values.squeeze(0)  # now shape is (T, hidden_size)
    
        seq_length = input_values.size(0)
        
        return {
            "input_values": input_values,
            "seq_length": seq_length,
            "transcript": transcript,
            "speaker": speaker
        }

In [4]:
def collate_fn(batch):
    input_values_list = []
    seq_lengths = []
    transcripts = []
    speakers = []
    
    for sample in batch:
        
        x = sample["input_values"]
        sample_seq_length = x.size(0)
        
        input_values_list.append(x)
        seq_lengths.append(sample_seq_length)
        transcripts.append(sample["transcript"])
        speakers.append(sample["speaker"])
    

    padded_inputs = torch.nn.utils.rnn.pad_sequence(input_values_list, batch_first=True, padding_value=0)
    
    
    padded_inputs = padded_inputs.contiguous()
    
    return {
        "input_values": padded_inputs,  # Now shape: (batch, time, hidden_size)
        "seq_lengths": torch.tensor(seq_lengths),
        "transcripts": transcripts,
        "speakers": speakers
    }


def transcript_to_indices(transcript, char_to_idx):
    return [char_to_idx[char] for char in transcript if char in char_to_idx]

In [5]:
class AttentionLayer(nn.Module):
    def __init__(self, hidden_dim):
        super(AttentionLayer, self).__init__()
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.scale = torch.sqrt(torch.tensor(hidden_dim, dtype=torch.float32))
        
    def forward(self, x):
        # x shape: [batch_size, seq_len, hidden_dim]
        batch_size, seq_len, hidden_dim = x.size()
        
        # Compute query, key, value projections
        q = self.query(x)  # [batch_size, seq_len, hidden_dim]
        k = self.key(x)    # [batch_size, seq_len, hidden_dim]
        v = self.value(x)  # [batch_size, seq_len, hidden_dim]
        
        # Compute attention scores (scaled dot-product attention)
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale  # [batch_size, seq_len, seq_len]
        
        
        # Apply softmax to get attention weights
        attn_weights = torch.softmax(scores, dim=-1)  # [batch_size, seq_len, seq_len]
        
        # Apply attention weights to values
        context = torch.matmul(attn_weights, v)  # [batch_size, seq_len, hidden_dim]
        
        # Combine with residual connection
        output = context + x  # [batch_size, seq_len, hidden_dim]
        
        return output

class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size, num_layers=3, dropout_rate=0.3):
        super(Model, self).__init__()
        
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU()
        )
        
        self.lstm_layers = nn.ModuleList([
            nn.LSTM(
                input_size=hidden_dim if i==0 else hidden_dim*2,
                hidden_size=hidden_dim,
                batch_first=True,
                bidirectional=True
            ) for i in range(num_layers)
        ])
        
        self.dropouts = nn.ModuleList([
            nn.Dropout(dropout_rate) for _ in range(num_layers)
        ])
        
        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(hidden_dim*2) for _ in range(num_layers)
        ])
        
        self.attention = AttentionLayer(hidden_dim*2)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU(),
            nn.Linear(hidden_dim, vocab_size)
        )
        
    def forward(self, x):
        if x.dim() == 3 and x.size(2) == 1:  
            x = x.squeeze(2)
            x = x.unsqueeze(2)
            
        batch_size, seq_len = x.size(0), x.size(1)
        

        x = self.input_projection(x)
        
        residual = None
        for i, (lstm, dropout, layer_norm) in enumerate(zip(self.lstm_layers, self.dropouts, self.layer_norms)):
            lstm_out, _ = lstm(x)
            lstm_out = dropout(lstm_out)
            
            if residual is not None and lstm_out.size() == residual.size():
                lstm_out = lstm_out + residual
                
            lstm_out = layer_norm(lstm_out)
            
            # Apply attention after the final LSTM layer
            if i == len(self.lstm_layers) - 1:
                lstm_out = self.attention(lstm_out)
                
            residual = lstm_out
            x = lstm_out
        
        logits = self.fc(x)
        
        logits = logits.transpose(0, 1)
        
        return logits
    
    def decode(self, x, seq_lengths):

        logits = self.forward(x) 
        

        predictions = torch.argmax(logits, dim=2)  
        predictions = predictions.transpose(0, 1)  
        
        return predictions

In [6]:
def calculate_cer(reference, prediction):
    distance = Levenshtein.distance(reference, prediction)
    return distance / max(len(reference), 1)

def calculate_wer(reference, prediction):
    ref_words = reference.split()
    pred_words = prediction.split()
    distance = Levenshtein.distance(ref_words, pred_words)
    return distance / max(len(ref_words), 1)

def trainModel(model, train_loader, val_loader, char_to_idx, num_epochs=10, learning_rate=1e-4, patience=3, min_delta=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model.to(device)
    ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    best_val_loss = float('inf')
    epochs_without_improvement = 0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        train_pbar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        
        for batch in train_pbar:
            inputs = batch["input_values"].to(device)
            input_lengths = batch["seq_lengths"].to(device)
            
            targets_list = [torch.tensor(transcript_to_indices(t, char_to_idx), dtype=torch.long)
                            for t in batch["transcripts"]]
            targets_list = [t if len(t) > 0 else torch.tensor([0], dtype=torch.long) for t in targets_list]
            
            targets = torch.cat(targets_list).to(device)
            target_lengths = torch.tensor([len(t) for t in targets_list], dtype=torch.long).to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            log_probs = torch.nn.functional.log_softmax(outputs, dim=2)
            loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            
            optimizer.step()
            running_loss += loss.item()
            train_pbar.set_postfix({"loss": f"{loss.item():.4f}"})
            
        avg_train_loss = running_loss / len(train_loader)
        
        model.eval()
        val_loss = 0.0
        val_pbar = tqdm.tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        
        with torch.no_grad():
            for batch in val_pbar:
                inputs = batch["input_values"].to(device)
                input_lengths = batch["seq_lengths"].to(device)
                
                targets_list = [torch.tensor(transcript_to_indices(t, char_to_idx), dtype=torch.long)
                                for t in batch["transcripts"]]
                targets_list = [t if len(t) > 0 else torch.tensor([0], dtype=torch.long) for t in targets_list]
                targets = torch.cat(targets_list).to(device)
                target_lengths = torch.tensor([len(t) for t in targets_list], dtype=torch.long).to(device)
                
                outputs = model(inputs)
                log_probs = torch.nn.functional.log_softmax(outputs, dim=2)
                loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
                val_loss += loss.item()
                val_pbar.set_postfix({"val_loss": f"{loss.item():.4f}"})
                
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
        
        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Current learning rate: {current_lr:.6f}")
        
        if avg_val_loss < best_val_loss - min_delta:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            best_model_state = model.state_dict().copy()
            print(f"New best validation loss: {best_val_loss:.4f}")
        else:
            epochs_without_improvement += 1
            print(f"No improvement for {epochs_without_improvement} epochs")
            
        if epochs_without_improvement >= patience:
            print(f"Early stopping after {epoch+1} epochs")
            model.load_state_dict(best_model_state)
            break
    
    if best_model_state is not None and epochs_without_improvement < patience:
        model.load_state_dict(best_model_state)
        
    print("Training complete.")
    return best_val_loss


def evaluateModel(model, test_loader, char_to_idx, idx_to_char, output_csv="evaluation_results.csv"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
    test_loss = 0.0
    all_predictions = []
    all_transcripts = []
    all_speakers = []
    
    with torch.no_grad():
        test_pbar = tqdm.tqdm(test_loader, desc="Evaluating")
        for batch in test_pbar:
            inputs = batch["input_values"].to(device)
            input_lengths = batch["seq_lengths"].to(device)
            transcripts = batch["transcripts"]
            speakers = batch["speakers"]
            
            targets_list = [torch.tensor(transcript_to_indices(t, char_to_idx), dtype=torch.long)
                            for t in transcripts]
            targets_list = [t if len(t) > 0 else torch.tensor([0], dtype=torch.long) for t in targets_list]
            targets = torch.cat(targets_list).to(device)
            target_lengths = torch.tensor([len(t) for t in targets_list], dtype=torch.long).to(device)
            
            outputs = model(inputs)
            log_probs = torch.nn.functional.log_softmax(outputs, dim=2)
            loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
            test_loss += loss.item()
            
            predictions = torch.argmax(outputs, dim=2).transpose(0, 1)  # [batch, time]
            
            batch_texts = []
            for pred in predictions:
                pred_collapsed = []
                prev = None
                for p in pred:
                    if p.item() != prev:
                        pred_collapsed.append(p.item())
                        prev = p.item()
                
                text = ''.join([idx_to_char.get(p, '') for p in pred_collapsed if p > 0])
                batch_texts.append(text)
            
            all_predictions.extend(batch_texts)
            all_transcripts.extend(transcripts)
            all_speakers.extend(speakers)
            
            test_pbar.set_postfix({"loss": f"{loss.item():.4f}"})
            
        avg_test_loss = test_loss / len(test_loader)
        print(f"Final Test Loss: {avg_test_loss:.4f}")
        
        # Calculate overall WER and CER
        total_cer = 0.0
        total_wer = 0.0
        for ref, pred in zip(all_transcripts, all_predictions):
            total_cer += calculate_cer(ref, pred)
            total_wer += calculate_wer(ref, pred)
        
        avg_cer = total_cer / len(all_predictions)
        avg_wer = total_wer / len(all_predictions)
        print(f"Overall Character Error Rate: {avg_cer:.4f}")
        print(f"Overall Word Error Rate: {avg_wer:.4f}")
        
        # Calculate per-speaker metrics
        speaker_predictions = defaultdict(list)
        speaker_references = defaultdict(list)
        
        for speaker, ref, pred in zip(all_speakers, all_transcripts, all_predictions):
            speaker_predictions[speaker].append(pred)
            speaker_references[speaker].append(ref)
        
        print("\nPer-Speaker Metrics:")
        for speaker in sorted(speaker_predictions.keys()):
            preds = speaker_predictions[speaker]
            refs = speaker_references[speaker]
            
            speaker_cer = sum(calculate_cer(r, p) for r, p in zip(refs, preds)) / len(preds)
            speaker_wer = sum(calculate_wer(r, p) for r, p in zip(refs, preds)) / len(preds)
            
            print(f"Speaker {speaker} (samples: {len(preds)})")
            print(f"  - Character Error Rate: {speaker_cer:.4f}")
            print(f"  - Word Error Rate: {speaker_wer:.4f}")
        
        for i in range(min(15, len(all_predictions))):
            print(f"Example {i+1} (Speaker: {all_speakers[i]}):\nReference: '{all_transcripts[i]}'\nPrediction: '{all_predictions[i]}'")
        
        # Save results to CSV file
        results_df = pd.DataFrame({
            'speaker': all_speakers,
            'reference': all_transcripts,
            'prediction': all_predictions,
            'cer': [calculate_cer(ref, pred) for ref, pred in zip(all_transcripts, all_predictions)],
            'wer': [calculate_wer(ref, pred) for ref, pred in zip(all_transcripts, all_predictions)]
        })
        
        results_df.to_csv(output_csv, index=False)
        print(f"\nEvaluation results saved to {output_csv}")
        
        return avg_test_loss, all_predictions, all_transcripts

In [7]:
vocab = "abcdefghijklmnopqrstuvwxyz "
char_to_idx = {char: i+1 for i, char in enumerate(vocab)}
idx_to_char = {i+1: char for i, char in enumerate(vocab)}
vocab_size = len(vocab) + 1


csv_file = "/kaggle/working/mlpr-libri-kaggle.csv"
full_dataset = LibriASRDataset(csv_file)

train_idx, test_idx = train_test_split(
    range(len(full_dataset)), 
    test_size=0.2, 
    random_state=42 
)


train_idx, val_idx = train_test_split(
    train_idx, 
    test_size=0.25,  
    random_state=42
)

train_dataset = Subset(full_dataset, train_idx)
val_dataset = Subset(full_dataset, val_idx)
test_dataset = Subset(full_dataset, test_idx)


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

input_dim = 1024
hidden_dim = 256
vocab_size = len(vocab) + 1

model = Model(input_dim=input_dim, hidden_dim=hidden_dim, vocab_size=vocab_size, num_layers=3, dropout_rate=0.3)

best_val_loss = trainModel(
    model, 
    train_loader, 
    val_loader, 
    char_to_idx, 
    num_epochs=30, 
    learning_rate=5e-4, 
    patience=5, 
    min_delta=0.01
)

print(f"Best validation loss: {best_val_loss:.4f}")
print("Evaluating best model on test set:")
test_loss, predictions, references = evaluateModel(model, test_loader, char_to_idx, idx_to_char)

Using device: cuda


  features = torch.load(feature_path, map_location='cpu')
Epoch 1/30 [Train]: 100%|██████████| 450/450 [19:34<00:00,  2.61s/it, loss=0.0199]
Epoch 1/30 [Val]: 100%|██████████| 150/150 [05:46<00:00,  2.31s/it, val_loss=0.0151]


Epoch [1/30] Training Loss: 0.9884, Validation Loss: 0.0139
Current learning rate: 0.000500
New best validation loss: 0.0139


Epoch 2/30 [Train]: 100%|██████████| 450/450 [17:52<00:00,  2.38s/it, loss=0.0110]
Epoch 2/30 [Val]: 100%|██████████| 150/150 [05:53<00:00,  2.35s/it, val_loss=0.0132]


Epoch [2/30] Training Loss: 0.0160, Validation Loss: 0.0117
Current learning rate: 0.000500
No improvement for 1 epochs


Epoch 3/30 [Train]: 100%|██████████| 450/450 [18:00<00:00,  2.40s/it, loss=0.0088]
Epoch 3/30 [Val]: 100%|██████████| 150/150 [05:42<00:00,  2.28s/it, val_loss=0.0123]


Epoch [3/30] Training Loss: 0.0142, Validation Loss: 0.0115
Current learning rate: 0.000500
No improvement for 2 epochs


Epoch 4/30 [Train]: 100%|██████████| 450/450 [19:27<00:00,  2.59s/it, loss=0.0223]
Epoch 4/30 [Val]: 100%|██████████| 150/150 [05:39<00:00,  2.27s/it, val_loss=0.0126]


Epoch [4/30] Training Loss: 0.0133, Validation Loss: 0.0114
Current learning rate: 0.000500
No improvement for 3 epochs


Epoch 5/30 [Train]: 100%|██████████| 450/450 [19:20<00:00,  2.58s/it, loss=0.0140]
Epoch 5/30 [Val]: 100%|██████████| 150/150 [06:20<00:00,  2.54s/it, val_loss=0.0112]


Epoch [5/30] Training Loss: 0.0128, Validation Loss: 0.0106
Current learning rate: 0.000500
No improvement for 4 epochs


Epoch 6/30 [Train]: 100%|██████████| 450/450 [17:50<00:00,  2.38s/it, loss=0.0131]
Epoch 6/30 [Val]: 100%|██████████| 150/150 [05:09<00:00,  2.07s/it, val_loss=0.0116]


Epoch [6/30] Training Loss: 0.0124, Validation Loss: 0.0112
Current learning rate: 0.000500
No improvement for 5 epochs
Early stopping after 6 epochs
Training complete.
Best validation loss: 0.0139
Evaluating best model on test set:


Evaluating: 100%|██████████| 150/150 [09:12<00:00,  3.69s/it, loss=0.0053]


Final Test Loss: 0.0120
Overall Character Error Rate: 0.0029
Overall Word Error Rate: 0.0130

Per-Speaker Metrics:
Speaker 17 (samples: 25)
  - Character Error Rate: 0.0004
  - Word Error Rate: 0.0021
Speaker 22 (samples: 24)
  - Character Error Rate: 0.0039
  - Word Error Rate: 0.0183
Speaker 23 (samples: 26)
  - Character Error Rate: 0.0022
  - Word Error Rate: 0.0084
Speaker 28 (samples: 9)
  - Character Error Rate: 0.0059
  - Word Error Rate: 0.0237
Speaker 38 (samples: 28)
  - Character Error Rate: 0.0013
  - Word Error Rate: 0.0065
Speaker 55 (samples: 24)
  - Character Error Rate: 0.0017
  - Word Error Rate: 0.0065
Speaker 64 (samples: 20)
  - Character Error Rate: 0.0015
  - Word Error Rate: 0.0053
Speaker 70 (samples: 21)
  - Character Error Rate: 0.0026
  - Word Error Rate: 0.0101
Speaker 79 (samples: 26)
  - Character Error Rate: 0.0043
  - Word Error Rate: 0.0236
Speaker 81 (samples: 25)
  - Character Error Rate: 0.0021
  - Word Error Rate: 0.0093
Speaker 98 (samples: 26)
 

In [8]:
model_save_path = "/kaggle/working/libri_asr_model.pt"

torch.save({
    'model_state_dict': model.state_dict(),
    'vocab': vocab,
    'char_to_idx': char_to_idx,
    'idx_to_char': idx_to_char,
    'model_config': {
        'input_dim': input_dim,
        'hidden_dim': hidden_dim,
        'vocab_size': vocab_size,
        'num_layers': 3,
        'dropout_rate': 0.3
    }
}, model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to /kaggle/working/libri_asr_model.pt
