<a href="https://colab.research.google.com/github/syedmahmoodiagents/Speech/blob/main/Speech_CNNvsRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchcodec

Collecting torchcodec
  Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 43.5 MB/s eta 0:00:00
Installing collected packages: torchcodec
Successfully installed torchcodec-0.9.1


In [None]:
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
import torch.nn as nn

In [None]:
class SimpleASRDataset(Dataset):
    """Character-level ASR dataset yielding ``(mfcc, target)`` pairs.

    Every clip is downmixed to mono, resampled to ``sample_rate``, zero-padded
    or cropped to exactly ``max_len`` samples and converted to MFCC features.

    Args:
        files: Audio file paths, index-aligned with ``texts``.
        texts: Transcript of each audio file.
        vocab: Ordered characters of the alphabet. Characters are numbered
            from 1 because index 0 is reserved for the CTC blank.
        sample_rate: Sampling rate (Hz) every clip is resampled to.
        max_len: Fixed clip length in samples.
        n_mfcc: Number of MFCC coefficients computed per frame.
    """

    def __init__(self, files, texts, vocab, *, sample_rate=16000, max_len=16000, n_mfcc=40):
        self.files = files
        self.texts = texts
        self.vocab = vocab
        self.char_to_idx = {c: i + 1 for i, c in enumerate(vocab)}  # 0=blank for CTC
        self.idx_to_char = {i + 1: c for i, c in enumerate(vocab)}

        self.sample_rate = sample_rate
        self.mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
        self.max_len = max_len  # default: 1 sec clips at 16 kHz, for simplicity

    def encode_text(self, text):
        """Return the vocabulary indices of ``text`` as a 1-D ``torch.long`` tensor.

        Raises:
            KeyError: If ``text`` contains a character that is not in the vocabulary.
        """
        # An explicit dtype keeps empty transcripts integer-typed; without it
        # torch.tensor([]) would silently produce a float tensor.
        return torch.tensor([self.char_to_idx[c] for c in text], dtype=torch.long)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(mfcc [T, n_mfcc], target [len(text)])``."""
        path = self.files[idx]
        text = self.texts[idx]

        audio, sr = torchaudio.load(path)

        # Average the channels so the rest of the pipeline sees a single channel.
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)
        if sr != self.sample_rate:
            audio = torchaudio.functional.resample(audio, sr, self.sample_rate)

        # Force a fixed number of samples: right-pad short clips, crop long ones.
        if audio.shape[1] < self.max_len:
            audio = F.pad(audio, (0, self.max_len - audio.shape[1]))
        else:
            audio = audio[:, :self.max_len]

        mfcc = self.mfcc(audio).squeeze(0)   # [n_mfcc, T]
        mfcc = mfcc.transpose(0, 1)          # [T, n_mfcc]

        target = self.encode_text(text)

        return mfcc, target

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.files)


In [None]:
def collate_fn(batch):
    """Merge ``(mfcc, target)`` samples into padded batch tensors.

    Args:
        batch: Sequence of samples; element 0 of each is a ``[T, F]`` feature
            tensor and element 1 a 1-D target tensor.

    Returns:
        Tuple ``(features, feature_lens, targets, target_lens)``. ``features``
        and ``targets`` are padded along the first axis to the longest item in
        the batch (batch-first); the ``*_lens`` tensors hold the unpadded
        lengths so the padding can be ignored downstream.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    features, labels = [], []
    feature_lens, label_lens = [], []
    for sample in batch:
        frames, encoded = sample[0], sample[1]
        features.append(frames)
        labels.append(encoded)
        feature_lens.append(frames.shape[0])
        label_lens.append(encoded.shape[0])

    return (
        pad(features, batch_first=True),
        torch.tensor(feature_lens),
        pad(labels, batch_first=True),
        torch.tensor(label_lens),
    )

# Using Convolution

In [None]:
class CNN_ASR(nn.Module):
    """Convolutional acoustic model producing one score vector per input frame.

    Two 3x3 convolutions with padding 1 leave the time and feature sizes
    unchanged. The final max-pool uses a (1, 2) kernel, so it halves only the
    last (feature) axis; the number of time frames is preserved.

    Args:
        n_mels: Size of the feature axis of the input.
        num_classes: Number of output classes per frame.
    """

    def __init__(self, n_mels=40, num_classes=30):
        super().__init__()

        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, 2)),  # halves the feature axis only
        ]
        self.cnn = nn.Sequential(*conv_stack)

        # 64 channels times the pooled feature size are flattened per frame.
        pooled_features = n_mels // 2
        self.fc = nn.Linear(in_features=64 * pooled_features, out_features=num_classes)

    def forward(self, x):
        """Map features ``[B, T, n_mels]`` to class scores ``[B, T, num_classes]``."""
        feature_maps = self.cnn(x.unsqueeze(1))    # [B, 64, T, n_mels // 2]
        per_frame = feature_maps.transpose(1, 2)   # [B, T, 64, n_mels // 2]
        batch_size, n_frames = per_frame.size(0), per_frame.size(1)
        # Merge channels and features into one vector per frame.
        flat = per_frame.reshape(batch_size, n_frames, -1)
        return self.fc(flat)

# Using RNN

In [None]:
class RNN_ASR(nn.Module):
    """Recurrent acoustic model: a 3-layer bidirectional LSTM plus a linear head.

    Args:
        n_mels: Size of the feature axis of the input.
        hidden: Hidden size of each LSTM direction.
        num_classes: Number of output classes per frame.
    """

    def __init__(self, n_mels=40, hidden=256, num_classes=30):
        super().__init__()

        self.lstm = nn.LSTM(
            n_mels,
            hidden,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, doubling the width.
        directions = 2
        self.fc = nn.Linear(directions * hidden, num_classes)

    def forward(self, x):
        """Map features ``[B, T, n_mels]`` to class scores ``[B, T, num_classes]``."""
        sequence_states = self.lstm(x)[0]   # final hidden/cell states are discarded
        return self.fc(sequence_states)


In [None]:
import torch.optim as optim

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


def train_model(model, name, loader, epochs=10):
    """Train ``model`` with CTC loss and print the summed loss of every epoch.

    Args:
        model: Module mapping features ``[B, T, F]`` to per-frame class scores
            ``[B, T, C]``. It must keep the number of frames, because the
            feature lengths from the loader are used as CTC input lengths.
        name: Label printed in front of each epoch's loss.
        loader: Iterable of ``(features, feature_lens, targets, target_lens)``
            batches.
        epochs: Number of passes over ``loader``.
    """
    model = model.to(device)
    ctc = nn.CTCLoss(blank=0, zero_infinity=True)
    opt = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for mfcc, mfcc_lens, tgt, tgt_lens in loader:
            mfcc = mfcc.to(device)
            tgt = tgt.to(device)

            logits = model(mfcc)                 # [B, T, C]

            # nn.CTCLoss expects log-probabilities shaped [T, B, C]. Feeding it
            # raw logits gives a meaningless loss that can even become negative,
            # so normalise over the class axis first.
            log_probs = logits.log_softmax(dim=-1).transpose(0, 1)
            loss = ctc(log_probs, tgt, mfcc_lens, tgt_lens)

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss.item()

        print(f"{name} Epoch {epoch+1}: Loss = {total_loss:.4f}")


In [None]:
# Toy transcripts used for the CNN-vs-RNN comparison.
texts = ["hello", "yes", "no", "open the door"]

# Character-level vocabulary: every distinct character that occurs in the
# transcripts, in sorted order.
vocab = sorted(set("".join(texts)))
print("Vocab:", vocab)
print("Vocab size:", len(vocab))


Vocab: [' ', 'd', 'e', 'h', 'l', 'n', 'o', 'p', 'r', 's', 't', 'y']
Vocab size: 12


In [None]:
import numpy as np
import scipy.io.wavfile as wavfile

In [None]:
# Write one second of uniform noise to disk so the pipeline can be exercised
# without a real recording.
dummy_audio_path = "dummy_audio.wav"
sample_rate = 16000
duration = 1  # seconds
dummy_audio = np.random.uniform(-0.5, 0.5, sample_rate * duration).astype(np.float32)
wavfile.write(dummy_audio_path, sample_rate, dummy_audio)

In [None]:
# Pair every transcript with the same placeholder recording.
files = [dummy_audio_path for _ in texts]

In [None]:
# Wrap the toy data in a dataset and batch it with the padding collate function.
dataset = SimpleASRDataset(files=files, texts=texts, vocab=vocab)
loader = DataLoader(dataset=dataset, batch_size=4, collate_fn=collate_fn)




In [None]:
# Both models get one output per vocabulary character plus one for the CTC
# blank (index 0); the feature size matches the 40 MFCC coefficients.
cnn_model = CNN_ASR(n_mels=40, num_classes=1 + len(vocab))
rnn_model = RNN_ASR(n_mels=40, num_classes=1 + len(vocab))

In [None]:
# Train both architectures on the same loader so their losses can be compared.
train_model(model=cnn_model, name="CNN", loader=loader)
print("-----------")
train_model(model=rnn_model, name="RNN", loader=loader)

CNN Epoch 1: Loss = 44.6848
CNN Epoch 2: Loss = -54.4574
CNN Epoch 3: Loss = 13.2778
CNN Epoch 4: Loss = 65.9578
CNN Epoch 5: Loss = 93.1612
CNN Epoch 6: Loss = 73.9377
CNN Epoch 7: Loss = 37.0053
CNN Epoch 8: Loss = -1.4626
CNN Epoch 9: Loss = -12.8997
CNN Epoch 10: Loss = -2.1957
-----------
RNN Epoch 1: Loss = -3.2533
RNN Epoch 2: Loss = 11.7204
RNN Epoch 3: Loss = 10.4251
RNN Epoch 4: Loss = 4.9397
RNN Epoch 5: Loss = -0.5747
RNN Epoch 6: Loss = -2.7585
RNN Epoch 7: Loss = -1.4666
RNN Epoch 8: Loss = 1.5214
RNN Epoch 9: Loss = 4.7736
RNN Epoch 10: Loss = 6.6024
