In [1]:
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, random_split, DataLoader, Subset
import string
import re
import os
import torch.nn.utils.rnn as rnn_utils
from transformers.feature_extraction_utils import BatchFeature
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
!pip install python-Levenshtein
import Levenshtein
from collections import defaultdict

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collecte

In [2]:
df = pd.read_csv('/kaggle/input/mlpr-data/torgo_vectors_transcripts.csv')
df['FeaturePath'] = df['FeaturePath'].str.replace("E:\\MLPR Data\\Features\\", "/kaggle/input/mlpr-data/Features/Features/")

# Create mask to filter out entries with 'input' and 'jpg'
mask1 = ~(df['transcipt'].str.contains('input', case=False, na=False) & 
         df['transcipt'].str.contains('jpg', case=False, na=False))

# Create mask to filter out entries with 'say' and 'repeatedly'
mask2 = ~(df['transcipt'].str.contains('say', case=False, na=False) & 
         df['transcipt'].str.contains('repeatedly', case=False, na=False))

# Combine both masks
mask = mask1 & mask2

df = df[mask]
speakers = df["Speaker"].unique() 
df.to_csv('mlpr-torgo-kaggle.csv', index=False)

In [3]:
class TorgoASRDataset(Dataset):
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        feature_path = row["FeaturePath"]
        transcript = row["transcipt"]
        speaker = row["Speaker"]
    
        try:
            features = torch.load(feature_path, map_location='cpu')
        except Exception as e:
            raise RuntimeError(f"Failed to load feature from '{feature_path}': {e}")
        
        if isinstance(features, dict):
            input_values = features.get("input_values")
            if input_values is None:
                raise ValueError(f"'input_values' key not found in features loaded from {feature_path}")
        elif hasattr(features, "input_values"):
            input_values = features.input_values
        else:
            input_values = features
    
        if not isinstance(input_values, torch.Tensor):
            input_values = torch.tensor(input_values)
    
        if input_values.dim() == 3:
            input_values = input_values.squeeze(0)  # now shape is (T, hidden_size)
    
        seq_length = input_values.size(0)
        
        return {
            "input_values": input_values,
            "seq_length": seq_length,
            "transcript": transcript,
            "speaker": speaker
        }

In [4]:
def collate_fn(batch):
    input_values_list = []
    seq_lengths = []
    transcripts = []
    speakers = []
    
    for sample in batch:
        
        x = sample["input_values"]
        sample_seq_length = x.size(0)
        
        input_values_list.append(x)
        seq_lengths.append(sample_seq_length)
        transcripts.append(sample["transcript"])
        speakers.append(sample["speaker"])
    

    padded_inputs = torch.nn.utils.rnn.pad_sequence(input_values_list, batch_first=True, padding_value=0)
    
    
    padded_inputs = padded_inputs.contiguous()
    
    return {
        "input_values": padded_inputs,  # Now shape: (batch, time, hidden_size)
        "seq_lengths": torch.tensor(seq_lengths),
        "transcripts": transcripts,
        "speakers": speakers
    }


def transcript_to_indices(transcript, char_to_idx):
    return [char_to_idx[char] for char in transcript if char in char_to_idx]

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=2, dropout_rate=0.3):
        super(Encoder, self).__init__()
        
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU()
        )
        
        # Bidirectional LSTM layers with layer normalization
        self.lstm_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        
        for i in range(num_layers):
            input_size = hidden_dim if i == 0 else hidden_dim * 2
            self.lstm_layers.append(nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_dim,
                batch_first=True,
                bidirectional=True
            ))
            self.layer_norms.append(nn.LayerNorm(hidden_dim * 2))
        
        self.dropout = nn.Dropout(dropout_rate)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
    def forward(self, x, input_lengths=None):
        # Project input to hidden dimension
        if x.dim() == 3 and x.size(2) == 1:  
            x = x.squeeze(2)
            x = x.unsqueeze(2)
        
        x = self.input_projection(x)
        
        # Process through LSTM layers with layer normalization and residual connections
        for i, (lstm, norm) in enumerate(zip(self.lstm_layers, self.layer_norms)):
            residual = x if i > 0 and x.size(-1) == hidden_dim * 2 else None
            
            # Use packed sequence for variable-length inputs
            if input_lengths is not None and i == 0:
                packed_x = rnn_utils.pack_padded_sequence(x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
                packed_output, (h, c) = lstm(packed_x)
                lstm_out, _ = rnn_utils.pad_packed_sequence(packed_output, batch_first=True)
            else:
                lstm_out, (h, c) = lstm(x)
            
            lstm_out = self.dropout(lstm_out)
            lstm_out = norm(lstm_out)
            
            # Add residual connection if dimensions match
            if residual is not None:
                lstm_out = lstm_out + residual
            
            x = lstm_out
            
            # Save final states for last layer
            if i == self.num_layers - 1:
                final_h, final_c = h, c
        
        # Process final states for decoder initialization
        forward_h = final_h[0].unsqueeze(0)   # [1, batch_size, hidden_dim]
        backward_h = final_h[1].unsqueeze(0)  # [1, batch_size, hidden_dim]
        forward_c = final_c[0].unsqueeze(0)   # [1, batch_size, hidden_dim]
        backward_c = final_c[1].unsqueeze(0)  # [1, batch_size, hidden_dim]
        
        # Concatenate bidirectional states
        h_concat = torch.cat([forward_h, backward_h], dim=2)  # [1, batch_size, hidden_dim*2]
        c_concat = torch.cat([forward_c, backward_c], dim=2)  # [1, batch_size, hidden_dim*2]
        
        return x, (h_concat, c_concat)  # encoder_outputs, (hidden, cell)


class BahdanauAttention(nn.Module):
    """Implements Bahdanau (additive) attention as described in the paper:"""
    """'Neural Machine Translation by Jointly Learning to Align and Translate' (2015)"""
    
    def __init__(self, encoder_dim, decoder_dim):
        super(BahdanauAttention, self).__init__()
        # Alignment model (Bahdanau's additive attention)
        self.W_a = nn.Linear(encoder_dim, decoder_dim, bias=False)
        self.U_a = nn.Linear(decoder_dim, decoder_dim, bias=False)
        self.v_a = nn.Linear(decoder_dim, 1, bias=False)
        
    def forward(self, hidden, encoder_outputs, mask=None):
        """Calculate context vector using Bahdanau attention mechanism
        
        Args:
            hidden: decoder's hidden state [1, batch_size, decoder_dim]
            encoder_outputs: outputs from encoder [batch_size, src_len, encoder_dim]
            mask: mask for padded elements in encoder_outputs, if any
            
        Returns:
            context: context vector [batch_size, encoder_dim]
            attention_weights: attention weights [batch_size, src_len]
        """
        batch_size = encoder_outputs.size(0)
        src_len = encoder_outputs.size(1)
        
        # Reshape decoder hidden state to match batch dimension
        hidden = hidden.transpose(0, 1).contiguous()  # [batch_size, 1, decoder_dim]
        hidden_expanded = hidden.repeat(1, src_len, 1)  # [batch_size, src_len, decoder_dim]
        
        # Calculate alignment scores
        # First transform encoder outputs with W_a
        encoder_transform = self.W_a(encoder_outputs)  # [batch_size, src_len, decoder_dim]
        
        # Then transform hidden state with U_a
        hidden_transform = self.U_a(hidden.squeeze(1)).unsqueeze(1)  # [batch_size, 1, decoder_dim]
        
        # Combine transforms and apply tanh
        # score = tanh(W_a*h_enc + U_a*h_dec)
        energy = torch.tanh(encoder_transform + hidden_transform)  # [batch_size, src_len, decoder_dim]
        
        # Apply v_a to get scalar scores
        energy = self.v_a(energy).squeeze(2)  # [batch_size, src_len]
        
        # Apply mask if provided (for padding)
        if mask is not None:
            energy.masked_fill_(mask == 0, -1e10)
        
        # Apply softmax to get attention weights
        attention_weights = torch.softmax(energy, dim=1)  # [batch_size, src_len]
        
        # Apply attention weights to get context vector
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)  # [batch_size, 1, encoder_dim]
        context = context.squeeze(1)  # [batch_size, encoder_dim]
        
        return context, attention_weights


class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, encoder_dim, hidden_dim, dropout_rate=0.3):
        super(Decoder, self).__init__()
        
        # Character embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Initialize decoder state with encoder final state
        self.init_h = nn.Linear(encoder_dim, hidden_dim)
        self.init_c = nn.Linear(encoder_dim, hidden_dim)
        
        # Bahdanau attention mechanism
        self.attention = BahdanauAttention(encoder_dim, hidden_dim)
        
        # LSTM for sequence processing
        self.lstm = nn.LSTM(
            input_size=embedding_dim + encoder_dim,  # Input: embedding + context
            hidden_size=hidden_dim,
            batch_first=True
        )
        
        # Regularization
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(hidden_dim)
        
        # Output projection with residual connection
        self.character_projection = nn.Sequential(
            nn.Linear(hidden_dim + encoder_dim + embedding_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, vocab_size)
        )
        
        self.hidden_dim = hidden_dim
        self.encoder_dim = encoder_dim
        
    def init_hidden(self, encoder_hidden, encoder_cell):
        """Initialize decoder state based on encoder final state"""
        # encoder_hidden/cell shape: [1, batch_size, encoder_dim]
        h = self.init_h(encoder_hidden)
        c = self.init_c(encoder_cell)
        return h, c
        
    def forward(self, input_char, hidden, cell, encoder_outputs, encoder_mask=None):
        """Single step of decoder with Bahdanau attention"""
        # Embed current character
        embedded = self.embedding(input_char).unsqueeze(1)  # [batch_size, 1, embedding_dim]
        
        # Calculate attention and context vector
        context, attn_weights = self.attention(hidden, encoder_outputs, encoder_mask)
        context = context.unsqueeze(1)  # [batch_size, 1, encoder_dim]
        
        # Concatenate embedding and context for LSTM input
        lstm_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, embedding_dim + encoder_dim]
        
        # Process through LSTM
        lstm_out, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        lstm_out = self.dropout(lstm_out)
        lstm_out = self.layer_norm(lstm_out)
        
        # Prepare output projection input by concatenating all available information
        concat_input = torch.cat((lstm_out.squeeze(1), context.squeeze(1), embedded.squeeze(1)), dim=1)
        output = self.character_projection(concat_input)
        
        return output, hidden, cell, attn_weights


class EncoderDecoderModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, vocab_size, sos_idx=0, eos_idx=0, pad_idx=0,
                 embedding_dim=128, encoder_layers=2, dropout_rate=0.3, max_decoding_length=150, device='cpu'):
        super(EncoderDecoderModel, self).__init__()
        
        # Encoder with multiple BiLSTM layers
        self.encoder = Encoder(input_dim, hidden_dim, num_layers=encoder_layers, dropout_rate=dropout_rate)
        
        # Decoder with Bahdanau attention
        encoder_output_dim = hidden_dim * 2  # BiLSTM outputs twice the hidden dim
        self.decoder = Decoder(vocab_size, embedding_dim, encoder_output_dim, hidden_dim, dropout_rate)
        
        # Special tokens
        self.sos_idx = sos_idx  # Start of sequence token
        self.eos_idx = eos_idx  # End of sequence token
        self.pad_idx = pad_idx  # Padding token
        
        self.vocab_size = vocab_size
        self.device = device
        self.hidden_dim = hidden_dim
        self.max_decoding_length = max_decoding_length
        
    def forward(self, x, target_sequence=None, teacher_forcing_ratio=0.5, max_length=100):
        """Forward pass with teacher forcing for training"""
        batch_size = x.size(0)
        
        # Encode input sequence
        encoder_outputs, (encoder_hidden, encoder_cell) = self.encoder(x)
        
        # Create encoder mask for attention (1 for real tokens, 0 for padding)
        encoder_mask = (torch.sum(encoder_outputs, dim=2) != 0).float().unsqueeze(1)
        
        # Initialize decoder state
        hidden, cell = self.decoder.init_hidden(
            encoder_hidden.transpose(0, 1).contiguous().transpose(1, 2).contiguous().transpose(1, 0),
            encoder_cell.transpose(0, 1).contiguous().transpose(1, 2).contiguous().transpose(1, 0)
        )
        
        # Set maximum decoding length
        if target_sequence is not None:
            max_length = min(target_sequence.size(1), self.max_decoding_length)
        else:
            max_length = min(self.max_decoding_length, max_length)
        
        # Initial decoder input: start token
        decoder_input = torch.ones(batch_size, dtype=torch.long).to(self.device) * self.sos_idx
        
        # Tensors to store outputs and attention
        outputs = torch.zeros(batch_size, max_length, self.vocab_size).to(self.device)
        attentions = torch.zeros(batch_size, max_length, encoder_outputs.size(1)).to(self.device)
        
        # Autoregressive decoding with teacher forcing
        for t in range(max_length):
            output, hidden, cell, attn_weights = self.decoder(
                decoder_input, hidden, cell, encoder_outputs, encoder_mask
            )
            
            # Store output and attention
            outputs[:, t] = output
            attentions[:, t] = attn_weights
            
            # Teacher forcing: use ground truth vs. previous prediction
            if target_sequence is not None and t < max_length - 1 and torch.rand(1).item() < teacher_forcing_ratio:
                # Use ground truth for next input
                decoder_input = target_sequence[:, t]
            else:
                # Use model prediction for next input
                top1 = output.argmax(1)
                decoder_input = top1
        
        return outputs, attentions
    
    def decode(self, x, max_length=None):
        """Generate text prediction using greedy decoding"""
        batch_size = x.size(0)
        if max_length is None:
            max_length = self.max_decoding_length
        else:
            max_length = min(self.max_decoding_length, max_length)
        
        # Encode input sequence
        encoder_outputs, (encoder_hidden, encoder_cell) = self.encoder(x)
        
        # Create encoder mask for attention
        encoder_mask = (torch.sum(encoder_outputs, dim=2) != 0).float().unsqueeze(1)
        
        # Initialize decoder state
        hidden, cell = self.decoder.init_hidden(
            encoder_hidden.transpose(0, 1).contiguous().transpose(1, 2).contiguous().transpose(1, 0),
            encoder_cell.transpose(0, 1).contiguous().transpose(1, 2).contiguous().transpose(1, 0)
        )
        
        # Initial decoder input: start token
        decoder_input = torch.ones(batch_size, dtype=torch.long).to(self.device) * self.sos_idx
        
        # Lists to store predictions
        all_predictions = []
        has_ended = torch.zeros(batch_size, dtype=torch.bool).to(self.device)
        
        # Autoregressive greedy decoding
        for t in range(max_length):
            output, hidden, cell, _ = self.decoder(
                decoder_input, hidden, cell, encoder_outputs, encoder_mask
            )
            
            # Greedy selection of most likely character
            predictions = output.argmax(1)
            all_predictions.append(predictions.unsqueeze(1))
            
            # Check for end of sequence token
            has_ended = has_ended | (predictions == self.eos_idx)
            if has_ended.all():
                break
                
            # Use current prediction as next input
            decoder_input = predictions
        
        # Concatenate all predictions
        predictions = torch.cat(all_predictions, dim=1)  # [batch_size, seq_len]
        
        return predictions

In [None]:
def calculate_cer(reference, prediction):
    distance = Levenshtein.distance(reference, prediction)
    return distance / max(len(reference), 1)

def calculate_wer(reference, prediction):
    ref_words = reference.split()
    pred_words = prediction.split()
    distance = Levenshtein.distance(ref_words, pred_words)
    return distance / max(len(ref_words), 1)

def preprocess_targets(transcripts, char_to_idx):
    """Convert text transcripts to index sequences and pad them"""
    target_seqs = [torch.tensor([char_to_idx.get(c, 0) for c in t if c in char_to_idx], dtype=torch.long) 
                  for t in transcripts]
    
    # Make sure all sequences have at least one token
    target_seqs = [t if len(t) > 0 else torch.tensor([0], dtype=torch.long) for t in target_seqs]
    
    # Pad sequences
    padded_targets = rnn_utils.pad_sequence(target_seqs, batch_first=True, padding_value=0)
    return padded_targets

def trainModel(model, train_loader, val_loader, char_to_idx, num_epochs=10, learning_rate=1e-4, patience=3, min_delta=0.001, teacher_forcing_ratio=0.5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model.to(device)
    model.device = device  # Update model's device attribute
    
    # Use cross entropy loss (handles the logits and target indices)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    best_val_loss = float('inf')
    epochs_without_improvement = 0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        
        for batch in train_pbar:
            inputs = batch["input_values"].to(device)
            transcripts = batch["transcripts"]
            
            # Preprocess target sequences
            target_seqs = preprocess_targets(transcripts, char_to_idx).to(device)
            
            optimizer.zero_grad()
            # Forward pass with teacher forcing
            outputs, _ = model(inputs, target_seqs, teacher_forcing_ratio=teacher_forcing_ratio)
            
            # Reshape outputs for CrossEntropyLoss
            batch_size, seq_len, vocab_size = outputs.size()
            outputs_flat = outputs.view(-1, vocab_size)
            targets_flat = target_seqs.view(-1)
            
            # Calculate loss
            loss = criterion(outputs_flat, targets_flat)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            
            optimizer.step()
            running_loss += loss.item()
            train_pbar.set_postfix({"loss": f"{loss.item():.4f}"})
            
        avg_train_loss = running_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0.0
        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        
        with torch.no_grad():
            for batch in val_pbar:
                inputs = batch["input_values"].to(device)
                transcripts = batch["transcripts"]
                
                # Preprocess target sequences
                target_seqs = preprocess_targets(transcripts, char_to_idx).to(device)
                
                # Forward pass without teacher forcing
                outputs, _ = model(inputs, target_seqs, teacher_forcing_ratio=0.0)
                
                # Reshape outputs for CrossEntropyLoss
                batch_size, seq_len, vocab_size = outputs.size()
                outputs_flat = outputs.view(-1, vocab_size)
                targets_flat = target_seqs.view(-1)
                
                # Calculate loss
                loss = criterion(outputs_flat, targets_flat)
                val_loss += loss.item()
                val_pbar.set_postfix({"val_loss": f"{loss.item():.4f}"})
                
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
        
        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Current learning rate: {current_lr:.6f}")
        
        if avg_val_loss < best_val_loss - min_delta:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            best_model_state = model.state_dict().copy()
            print(f"New best validation loss: {best_val_loss:.4f}")
        else:
            epochs_without_improvement += 1
            print(f"No improvement for {epochs_without_improvement} epochs")
            
        if epochs_without_improvement >= patience:
            print(f"Early stopping after {epoch+1} epochs")
            model.load_state_dict(best_model_state)
            break
    
    if best_model_state is not None and epochs_without_improvement < patience:
        model.load_state_dict(best_model_state)
        
    print("Training complete.")
    return best_val_loss


def evaluateModel(model, test_loader, char_to_idx, idx_to_char, output_csv="evaluation_results.csv"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.device = device  # Update model's device attribute
    model.eval()
    
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index
    test_loss = 0.0
    all_predictions = []
    all_transcripts = []
    all_speakers = []
    
    with torch.no_grad():
        test_pbar = tqdm(test_loader, desc="Evaluating")
        for batch in test_pbar:
            inputs = batch["input_values"].to(device)
            transcripts = batch["transcripts"]
            speakers = batch["speakers"]
            
            # Get target sequences for loss calculation
            target_seqs = preprocess_targets(transcripts, char_to_idx).to(device)
            
            # Generate predictions with improved decoder (limit to 50 chars to prevent long sequences)
            predictions = model.decode(inputs, max_length=50)
            
            # Calculate loss with teacher forcing outputs
            outputs, _ = model(inputs, target_seqs, teacher_forcing_ratio=0.0)
            outputs_flat = outputs.view(-1, model.vocab_size)
            targets_flat = target_seqs.view(-1)
            loss = criterion(outputs_flat, targets_flat)
            test_loss += loss.item()
            
            # Convert prediction indices to text
            batch_texts = []
            for pred in predictions:
                text = ''.join([idx_to_char.get(p.item(), '') for p in pred if p.item() > 0])
                # Simple truncation at first double space if it exists
                if '  ' in text:
                    text = text.split('  ')[0]
                batch_texts.append(text)
            
            all_predictions.extend(batch_texts)
            all_transcripts.extend(transcripts)
            all_speakers.extend(speakers)
            
            test_pbar.set_postfix({"loss": f"{loss.item():.4f}"})
            
        avg_test_loss = test_loss / len(test_loader)
        print(f"Final Test Loss: {avg_test_loss:.4f}")
        
        # Calculate overall WER and CER
        total_cer = 0.0
        total_wer = 0.0
        for ref, pred in zip(all_transcripts, all_predictions):
            total_cer += calculate_cer(ref, pred)
            total_wer += calculate_wer(ref, pred)
        
        avg_cer = total_cer / len(all_predictions)
        avg_wer = total_wer / len(all_predictions)
        print(f"Overall Character Error Rate: {avg_cer:.4f}")
        print(f"Overall Word Error Rate: {avg_wer:.4f}")
        
        # Calculate per-speaker metrics
        speaker_predictions = defaultdict(list)
        speaker_references = defaultdict(list)
        
        for speaker, ref, pred in zip(all_speakers, all_transcripts, all_predictions):
            speaker_predictions[speaker].append(pred)
            speaker_references[speaker].append(ref)
        
        print("\nPer-Speaker Metrics:")
        for speaker in sorted(speaker_predictions.keys()):
            preds = speaker_predictions[speaker]
            refs = speaker_references[speaker]
            
            speaker_cer = sum(calculate_cer(r, p) for r, p in zip(refs, preds)) / len(preds)
            speaker_wer = sum(calculate_wer(r, p) for r, p in zip(refs, preds)) / len(preds)
            
            print(f"Speaker {speaker} (samples: {len(preds)})")
            print(f"  - Character Error Rate: {speaker_cer:.4f}")
            print(f"  - Word Error Rate: {speaker_wer:.4f}")
        
        # Print examples
        for i in range(min(15, len(all_predictions))):
            print(f"Example {i+1} (Speaker: {all_speakers[i]}):\nReference: '{all_transcripts[i]}'\nPrediction: '{all_predictions[i]}'")
        
        # Save results to CSV file
        results_df = pd.DataFrame({
            'speaker': all_speakers,
            'reference': all_transcripts,
            'prediction': all_predictions,
            'cer': [calculate_cer(ref, pred) for ref, pred in zip(all_transcripts, all_predictions)],
            'wer': [calculate_wer(ref, pred) for ref, pred in zip(all_transcripts, all_predictions)]
        })
        
        results_df.to_csv(output_csv, index=False)
        print(f"\nEvaluation results saved to {output_csv}")
        
        return avg_test_loss, all_predictions, all_transcripts

In [None]:
vocab = "abcdefghijklmnopqrstuvwxyz "
char_to_idx = {char: i+1 for i, char in enumerate(vocab)}
idx_to_char = {i+1: char for i, char in enumerate(vocab)}
vocab_size = len(vocab) + 1  # +1 for padding/blank token

csv_file = "/kaggle/working/mlpr-torgo-kaggle.csv"
full_dataset = TorgoASRDataset(csv_file)

train_idx, test_idx = train_test_split(
    range(len(full_dataset)), 
    test_size=0.2, 
    random_state=42 
)

train_idx, val_idx = train_test_split(
    train_idx, 
    test_size=0.25,  
    random_state=42
)

train_dataset = Subset(full_dataset, train_idx)
val_dataset = Subset(full_dataset, val_idx)
test_dataset = Subset(full_dataset, test_idx)

# Smaller batch size to prevent out of memory errors
train_loader = DataLoader(train_dataset, batch_size=24, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=24, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=24, shuffle=False, collate_fn=collate_fn)

# Model parameters
input_dim = 1024
hidden_dim = 256
embedding_dim = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create improved encoder-decoder model with Bahdanau attention
model = EncoderDecoderModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    encoder_layers=2,
    dropout_rate=0.3,
    max_decoding_length=50,  # Limit sequence length
    device=device
)

# Train the model
best_val_loss = trainModel(
    model,
    train_loader,
    val_loader,
    char_to_idx,
    num_epochs=30,
    learning_rate=5e-4,
    patience=5,
    min_delta=0.01,
    teacher_forcing_ratio=0.5
)

print(f"Best validation loss: {best_val_loss:.4f}")
print("Evaluating best model on test set:")
test_loss, predictions, references = evaluateModel(model, test_loader, char_to_idx, idx_to_char)

# Display examples
print("\nExamples with improved Bahdanau attention:")
for i, (ref, pred) in enumerate(zip(references[:10], predictions[:10])):
    print(f"Example {i+1}:")
    print(f"Reference: '{ref}'")
    print(f"Prediction: '{pred}'")
    print()

Using device: cuda


  features = torch.load(feature_path, map_location='cpu')
Epoch 1/30 [Train]: 100%|██████████| 81/81 [01:14<00:00,  1.09it/s, loss=3.4227]
Epoch 1/30 [Val]: 100%|██████████| 27/27 [00:17<00:00,  1.54it/s, val_loss=3.2810]


Epoch [1/30] Training Loss: 4.0460, Validation Loss: 3.3246
Current learning rate: 0.000500
New best validation loss: 3.3246


Epoch 2/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.00it/s, loss=2.1067]
Epoch 2/30 [Val]: 100%|██████████| 27/27 [00:08<00:00,  3.36it/s, val_loss=1.4831]


Epoch [2/30] Training Loss: 2.9373, Validation Loss: 1.6671
Current learning rate: 0.000500
New best validation loss: 1.6671


Epoch 3/30 [Train]: 100%|██████████| 81/81 [00:41<00:00,  1.95it/s, loss=1.3185]
Epoch 3/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.39it/s, val_loss=1.1698]


Epoch [3/30] Training Loss: 1.4258, Validation Loss: 1.3569
Current learning rate: 0.000500
New best validation loss: 1.3569


Epoch 4/30 [Train]: 100%|██████████| 81/81 [00:41<00:00,  1.97it/s, loss=1.1889]
Epoch 4/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.43it/s, val_loss=1.0578]


Epoch [4/30] Training Loss: 1.2476, Validation Loss: 1.2822
Current learning rate: 0.000500
New best validation loss: 1.2822


Epoch 5/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.00it/s, loss=1.3773]
Epoch 5/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.49it/s, val_loss=1.0783]


Epoch [5/30] Training Loss: 1.1790, Validation Loss: 1.2759
Current learning rate: 0.000500
No improvement for 1 epochs


Epoch 6/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  1.98it/s, loss=1.0186]
Epoch 6/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.46it/s, val_loss=1.0925]


Epoch [6/30] Training Loss: 1.1209, Validation Loss: 1.2970
Current learning rate: 0.000500
No improvement for 2 epochs


Epoch 7/30 [Train]: 100%|██████████| 81/81 [00:39<00:00,  2.03it/s, loss=1.3075]
Epoch 7/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.48it/s, val_loss=1.0583]


Epoch [7/30] Training Loss: 1.0674, Validation Loss: 1.2332
Current learning rate: 0.000500
New best validation loss: 1.2332


Epoch 8/30 [Train]: 100%|██████████| 81/81 [00:41<00:00,  1.96it/s, loss=1.2491]
Epoch 8/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.45it/s, val_loss=1.0402]


Epoch [8/30] Training Loss: 1.0282, Validation Loss: 1.2714
Current learning rate: 0.000500
No improvement for 1 epochs


Epoch 9/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.02it/s, loss=1.1432]
Epoch 9/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.43it/s, val_loss=1.0340]


Epoch [9/30] Training Loss: 0.9884, Validation Loss: 1.2270
Current learning rate: 0.000500
No improvement for 2 epochs


Epoch 10/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.00it/s, loss=0.9953]
Epoch 10/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.45it/s, val_loss=1.0567]


Epoch [10/30] Training Loss: 0.9366, Validation Loss: 1.2470
Current learning rate: 0.000500
No improvement for 3 epochs


Epoch 11/30 [Train]: 100%|██████████| 81/81 [00:41<00:00,  1.95it/s, loss=0.9240]
Epoch 11/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.46it/s, val_loss=1.0127]


Epoch [11/30] Training Loss: 0.8919, Validation Loss: 1.2363
Current learning rate: 0.000500
No improvement for 4 epochs


Epoch 12/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.00it/s, loss=1.0032]
Epoch 12/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.48it/s, val_loss=1.0229]


Epoch [12/30] Training Loss: 0.8589, Validation Loss: 1.1458
Current learning rate: 0.000500
New best validation loss: 1.1458


Epoch 13/30 [Train]: 100%|██████████| 81/81 [00:41<00:00,  1.97it/s, loss=0.7446]
Epoch 13/30 [Val]: 100%|██████████| 27/27 [00:08<00:00,  3.36it/s, val_loss=1.0261]


Epoch [13/30] Training Loss: 0.8329, Validation Loss: 1.2075
Current learning rate: 0.000500
No improvement for 1 epochs


Epoch 14/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.00it/s, loss=0.6829]
Epoch 14/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.38it/s, val_loss=1.0128]


Epoch [14/30] Training Loss: 0.7985, Validation Loss: 1.2032
Current learning rate: 0.000500
No improvement for 2 epochs


Epoch 15/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.01it/s, loss=0.7834]
Epoch 15/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.48it/s, val_loss=1.0915]


Epoch [15/30] Training Loss: 0.7456, Validation Loss: 1.2325
Current learning rate: 0.000250
No improvement for 3 epochs


Epoch 16/30 [Train]: 100%|██████████| 81/81 [00:41<00:00,  1.97it/s, loss=0.7626]
Epoch 16/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.49it/s, val_loss=1.0686]


Epoch [16/30] Training Loss: 0.6627, Validation Loss: 1.2288
Current learning rate: 0.000250
No improvement for 4 epochs


Epoch 17/30 [Train]: 100%|██████████| 81/81 [00:40<00:00,  2.00it/s, loss=0.3935]
Epoch 17/30 [Val]: 100%|██████████| 27/27 [00:07<00:00,  3.50it/s, val_loss=1.0733]


Epoch [17/30] Training Loss: 0.6217, Validation Loss: 1.2215
Current learning rate: 0.000250
No improvement for 5 epochs
Early stopping after 17 epochs
Training complete.
Best validation loss: 1.1458
Evaluating best model on test set:


Evaluating: 100%|██████████| 27/27 [00:28<00:00,  1.06s/it, loss=1.0921]

Final Test Loss: 1.1111
Overall Character Error Rate: 0.2820
Overall Word Error Rate: 0.5125

Per-Speaker Metrics:
Speaker F01 (samples: 22)
  - Character Error Rate: 0.5377
  - Word Error Rate: 0.8422
Speaker F03 (samples: 128)
  - Character Error Rate: 0.4369
  - Word Error Rate: 0.7304
Speaker F04 (samples: 89)
  - Character Error Rate: 0.2215
  - Word Error Rate: 0.4067
Speaker FC01 (samples: 42)
  - Character Error Rate: 0.3103
  - Word Error Rate: 0.6110
Speaker FC02 (samples: 244)
  - Character Error Rate: 0.1946
  - Word Error Rate: 0.4468
Speaker FC03 (samples: 192)
  - Character Error Rate: 0.2699
  - Word Error Rate: 0.4902
Speaker M01 (samples: 13)
  - Character Error Rate: 0.6692
  - Word Error Rate: 0.9049
Speaker M02 (samples: 79)
  - Character Error Rate: 0.6209
  - Word Error Rate: 0.9908
Speaker M03 (samples: 96)
  - Character Error Rate: 0.1470
  - Word Error Rate: 0.2822
Speaker M04 (samples: 89)
  - Character Error Rate: 0.6290
  - Word Error Rate: 0.9480
Speaker M


