<a href="https://colab.research.google.com/github/sumits234/Automatic-Speech-Recognition/blob/main/Automatic_Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Cell 1: Mount Google Drive ===
# Attach Google Drive to the Colab VM; the LibriSpeech archives are read from it later.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)


Mounted at /content/drive


In [None]:
!pip install --upgrade transformers


Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached transformers-4.52.4-py3-none-any.whl (10.5 MB)
Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.1
    Uninstalling transformers-4.40.1:
      Successfully uninstalled transformers-4.40.1
Successfully installed tokenizers-0.21.1 transformers-4.52.4


In [None]:

# We'll use: torchaudio, datasets, transformers, evaluate, soundfile, fastapi, uvicorn

!pip install --quiet torchaudio datasets transformers evaluate soundfile fastapi uvicorn


In [None]:
#  Define Paths & Imports
import os
import torch
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,

)
from datasets import load_metric, Dataset, Audio
import numpy as np
import random
import evaluate

In [None]:
# === Cell 3: Extract both train-clean-100 and test-clean ===

# 3.1 Ensure target local folder exists
!rm -rf /content/LibriSpeech           # Start fresh to avoid duplication
!mkdir -p /content/LibriSpeech

# 3.2 Extract train-clean-100.tar.gz
# This will extract /content/LibriSpeech/train-clean-100
!tar -xzf "/content/drive/MyDrive/LibriSpeech/train-clean-100.tar.gz" -C /content

# 3.3 Extract test-clean.tar.gz
# This will extract /content/LibriSpeech/test-clean
!tar -xzf "/content/drive/MyDrive/LibriSpeech/test-clean.tar.gz" -C /content

# 3.4 After extraction, everything should now be in:
# /content/LibriSpeech/train-clean-100/
# /content/LibriSpeech/test-clean/

# 3.5 Confirm folder structure
!echo "✅ Extraction complete. Contents of /content/LibriSpeech:"
!ls -l /content/LibriSpeech


✅ Extraction complete. Contents of /content/LibriSpeech:
total 916
-rw-r--r--   1 1000 1000 115746 Oct  3  2014 BOOKS.TXT
-rw-r--r--   1 1000 1000 671086 Aug 17  2014 CHAPTERS.TXT
-rw-r--r--   1 1000 1000    193 Aug 17  2014 LICENSE.TXT
-rw-r--r--   1 1000 1000   8039 Oct  3  2014 README.TXT
-rw-r--r--   1 1000 1000 125034 Aug 17  2014 SPEAKERS.TXT
drwxr-xr-x  42 1000 1000   4096 Aug 16  2014 test-clean
drwxr-xr-x 253 1000 1000   4096 Aug 16  2014 train-clean-100


In [None]:
# === Cell 4: Confirm the LOCAL folders and set paths for the rest of the notebook ===
import os

LIBRISPEECH_ROOT = "/content/LibriSpeech"
TRAIN_SUBSET     = os.path.join(LIBRISPEECH_ROOT, "train-clean-100")
TEST_SUBSET      = os.path.join(LIBRISPEECH_ROOT, "test-clean")

print("Contents of /content/LibriSpeech:\n", os.listdir(LIBRISPEECH_ROOT), "\n")

# Print an explicit status for both outcomes: previously a found folder printed
# an empty string, and the two failure messages differed in whitespace.
for _label, _subset in (("TRAIN_SUBSET", TRAIN_SUBSET), ("TEST_SUBSET ", TEST_SUBSET)):
    _status = "found" if os.path.isdir(_subset) else "NOT FOUND"
    print(f"{_label} =", _subset, "→", _status)



Contents of /content/LibriSpeech:
 ['CHAPTERS.TXT', 'LICENSE.TXT', 'BOOKS.TXT', 'SPEAKERS.TXT', 'train-clean-100', 'test-clean', 'README.TXT'] 

TRAIN_SUBSET = /content/LibriSpeech/train-clean-100 → 
TEST_SUBSET  = /content/LibriSpeech/test-clean → 


In [None]:
# Seed PyTorch, NumPy and the stdlib RNG with one shared value so repeated runs
# draw the same random numbers.
SEED = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(SEED)

print("Ready to proceed with Dataset exploration and model training.")

Ready to proceed with Dataset exploration and model training.


In [None]:
# === Basic exploration of train-clean-100 & test-clean ===
import soundfile as sf
from pathlib import Path
from torchaudio.datasets import LIBRISPEECH

def gather_stats(path):
    """Count the FLAC files under ``path`` and add up their playing time.

    Args:
        path: Directory that is searched recursively for ``*.flac`` files.

    Returns:
        A ``(n_files, total_seconds)`` tuple; ``(0, 0.0)`` when nothing matches.
    """
    count, seconds = 0, 0.0
    for count, audio_file in enumerate(Path(path).rglob("*.flac"), start=1):
        # Only the header is queried: duration = frame count / sample rate.
        meta = sf.info(str(audio_file))
        seconds += meta.frames / meta.samplerate
    return count, seconds

# Subset locations on the local disk (LibriSpeech was extracted under /content).
TRAIN_SUBSET = "/content/LibriSpeech/train-clean-100"
TEST_SUBSET  = "/content/LibriSpeech/test-clean"

# 1. File counts and total audio duration per subset
(train_n, train_dur), (test_n, test_dur) = (
    gather_stats(TRAIN_SUBSET),
    gather_stats(TEST_SUBSET),
)

print(f"Train-clean-100: {train_n} audio files, ≈{train_dur/3600:.2f} hours")
print(f"Test-clean:      {test_n} audio files, ≈{test_dur/3600:.2f} hours")

# 2. Word counts of the first 100 transcripts from torchaudio's LIBRISPEECH loader.
#    root="/content" lets the loader resolve "/content/LibriSpeech/train-clean-100".
loader_train = LIBRISPEECH(root="/content", url="train-clean-100", download=False)
lengths = [
    len(transcript.split())
    for _, (_, _, transcript, *_) in zip(range(100), loader_train)
]

import numpy as _np
print(f"Sample transcripts (n=100): avg words={_np.mean(lengths):.1f}, min={_np.min(lengths)}, max={_np.max(lengths)}")


Train-clean-100: 28539 audio files, ≈100.59 hours
Test-clean:      2620 audio files, ≈5.40 hours
Sample transcripts (n=100): avg words=48.1, min=15, max=68


In [None]:
# === Cell 7: Task 2 — Load Pretrained Wav2Vec2 & Compute Baseline WER ===
# Checkpoint identifier shared by the processor and the CTC model.
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Load the weights, then move them to the GPU when one is present (CPU otherwise).
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Check for the test-clean directory
!ls "/content/LibriSpeech"


BOOKS.TXT     LICENSE.TXT  SPEAKERS.TXT  train-clean-100
CHAPTERS.TXT  README.TXT   test-clean


In [None]:
# Root handed to torchaudio's loader, and the test-clean folder beneath it
LIBRISPEECH_ROOT = "/content"
TEST_SUBSET = os.path.join(LIBRISPEECH_ROOT, "LibriSpeech", "test-clean")


In [None]:
!ls "/content/LibriSpeech/test-clean/0121/121726"


ls: cannot access '/content/LibriSpeech/test-clean/0121/121726': No such file or directory


In [None]:
# List the top‐level inside test-clean
!ls "/content/LibriSpeech/test-clean"

# Then, for example, list the contents of speaker “121”:
!ls "/content/LibriSpeech/test-clean/121"


1089  1284  2094  2830	3729  4970  5639  6829	7176  8455
1188  1320  2300  2961	4077  4992  5683  6930	7729  8463
121   1580  237   3570	4446  5105  61	  7021	8224  8555
1221  1995  260   3575	4507  5142  672   7127	8230  908
121726	123852	123859	127105


In [None]:
!pip install jiwer



In [None]:
import pandas as pd

# === Baseline WER on test-clean ===
LIBRISPEECH_ROOT = "/content"
TEST_SUBSET      = "/content/LibriSpeech/test-clean"

# 2) Pretrained processor + model, placed on the GPU when available
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(target_device)
print("Loaded pretrained Wav2Vec2 model and processor.\n")

# 3) One row per utterance: path to the .flac file plus its transcript.
#    Folder names use the raw speaker/chapter ids; only the utterance id in the
#    file name is zero-padded to 4 digits.
loader_test = LIBRISPEECH(root=LIBRISPEECH_ROOT, url="test-clean", download=False)
test_rows = [
    {
        "audio_path": os.path.join(
            TEST_SUBSET,
            str(speaker_id),
            str(chapter_id),
            f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac",
        ),
        "transcript": transcript,
    }
    for _, _, transcript, speaker_id, chapter_id, utterance_id in loader_test
]

test_df = pd.DataFrame(test_rows)
# Turn the path column into an audio column at 16 kHz and give both columns
# the names used by the preprocessing step.
dataset_test = (
    Dataset.from_pandas(test_df)
    .cast_column("audio_path", Audio(sampling_rate=16000))
    .rename_column("audio_path", "audio")
    .rename_column("transcript", "text")
)
print(f" Built dataset_test with {len(dataset_test)} utterances.\n")

# 4) Preprocessing function: waveform → input_values; transcript → labels
def prepare_batch(batch):
    """Convert one dataset example into model inputs and label ids.

    Args:
        batch: A single example with an ``"audio"`` entry (read here through its
            ``"array"`` and ``"sampling_rate"`` keys) and a ``"text"`` transcript.

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    # Forward the clip's own sampling rate instead of a hard-coded 16000, so a
    # clip that was not resampled is not silently treated as 16 kHz audio.
    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_values[0]
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

dataset_test = dataset_test.map(prepare_batch, remove_columns=["audio", "text"], num_proc=4)
print(" Preprocessing complete (mapped input_values + labels).\n")

# 5) Batched inference over the whole test set, followed by the WER computation
wer_metric = evaluate.load("wer")
predictions, references = [], []
batch_size = 8

for start in range(0, len(dataset_test), batch_size):
    chunk = dataset_test[start : start + batch_size]

    # Pad the variable-length inputs so they stack into a single tensor.
    padded = processor.feature_extractor.pad(
        [{"input_values": values} for values in chunk["input_values"]],
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(padded["input_values"].to(model.device)).logits

    # Highest-scoring token per frame, decoded to text.
    predictions += processor.batch_decode(logits.argmax(dim=-1))
    # Reference ids are decoded with token grouping switched off.
    references += processor.batch_decode(chunk["labels"], group_tokens=False)

# Final WER score
baseline_wer = wer_metric.compute(predictions=predictions, references=references)
print(f"\n Baseline WER (pretrained {model_name}): {baseline_wer:.3f}\n")




Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded pretrained Wav2Vec2 model and processor.

 Built dataset_test with 2620 utterances.



Map (num_proc=4):   0%|          | 0/2620 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


 Preprocessing complete (mapped input_values + labels).


 Baseline WER (pretrained facebook/wav2vec2-base-960h): 0.038



In [None]:
# Print reference vs. hypothesis for a handful of utterances
for idx in (15, 100, 1000):
    print(f"— idx={idx}\n  REF = {references[idx]}\n  HYP = {predictions[idx]}\n")

— idx=15
  REF = BUT THE DUSK DEEPENING IN THE SCHOOLROOM COVERED OVER HIS THOUGHTS THE BELL RANG
  HYP = BUT THE DUSK DEEPENING IN THE SCHOOLROOM COVERED OVER HIS THOUGHTS THE BELL RANG

— idx=100
  REF = IN BOTH THESE HIGH MYTHICAL SUBJECTS THE SURROUNDING NATURE THOUGH SUFFERING IS STILL DIGNIFIED AND BEAUTIFUL
  HYP = IN BOTH THESE HIGH MYTHICAL SUBJECTS THE SURROUNDING NATURE THOSE SUFFERING IS STILL DIGNIFIED AND BEAUTIFUL

— idx=1000
  REF = OF THIS SECOND LETTER ALSO SHE SPOKE AND TOLD ME THAT IT CONTAINED AN INVITATION FOR HER TO GO AND SEE THE POET IF EVER SHE VISITED THE LAKES
  HYP = OF THIS SECOND LETTER ALSO SHE SPOKE AND TOLD ME THAT IT CONTAINED AN INVITATION FOR HER TO GO AND SEE THE POET IF EVER SHE VISITED THE LAKES



In [None]:
# === Cell H: Save Model and Processor for FastAPI Deployment ===

# Destination folder (here on Drive; any local folder works as well)
export_dir = "/content/drive/MyDrive/LibriSpeech/wav2vec2-fastapi"

# Write the processor first, then the model, into the same folder
for artifact in (processor, model):
    artifact.save_pretrained(export_dir)

print(f"✅ Model and processor saved to: {export_dir}")


✅ Model and processor saved to: /content/drive/MyDrive/LibriSpeech/wav2vec2-fastapi


In [None]:

# Locations of the extracted corpus and the checkpoint to start from
LIBRISPEECH_ROOT = "/content"
TRAIN_SUBSET = "/content/LibriSpeech/train-clean-100"
TEST_SUBSET = "/content/LibriSpeech/test-clean"
model_name = "facebook/wav2vec2-base-960h"

# A separate model instance (model_ft) is loaded for fine-tuning so the
# baseline `model` object is left as it is.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model_ft = Wav2Vec2ForCTC.from_pretrained(model_name)
model_ft = model_ft.to("cuda" if torch.cuda.is_available() else "cpu")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# === Prepare train-clean-100 and test-clean Datasets ===

def _librispeech_records(loader):
    """Return an iterable of ``__getitem__``-shaped tuples for every utterance.

    When the loader offers ``get_metadata`` it is used, because only the
    transcript and the three ids are needed here and decoding each waveform just
    to discard it is wasted work. The first tuple element is ignored by the
    caller, so it does not matter whether it is a waveform or a file path.
    """
    if hasattr(loader, "get_metadata"):
        return (loader.get_metadata(index) for index in range(len(loader)))
    return iter(loader)


def _build_audio_dataset(root, url, subset_dir):
    """Build the path/transcript DataFrame and the matching audio Dataset.

    Args:
        root: Directory passed to torchaudio's LIBRISPEECH loader as ``root``.
        url: Subset name understood by the loader (e.g. ``"test-clean"``).
        subset_dir: Folder holding ``<speaker>/<chapter>/<file>.flac`` for that subset.

    Returns:
        ``(frame, dataset)`` where ``frame`` has the columns ``audio_path`` and
        ``transcript`` and ``dataset`` exposes them as ``audio`` (16 kHz) and ``text``.
    """
    loader = LIBRISPEECH(root=root, url=url, download=False)
    rows = [
        {
            # Folder names use the raw ids; only the utterance id is zero-padded.
            "audio_path": os.path.join(
                subset_dir,
                str(speaker_id),
                str(chapter_id),
                f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac",
            ),
            "transcript": transcript,
        }
        for _, _, transcript, speaker_id, chapter_id, utterance_id in _librispeech_records(loader)
    ]
    frame = pd.DataFrame(rows)
    dataset = (
        Dataset.from_pandas(frame)
        .cast_column("audio_path", Audio(sampling_rate=16000))
        .rename_column("audio_path", "audio")
        .rename_column("transcript", "text")
    )
    return frame, dataset


train_df, dataset_train = _build_audio_dataset(LIBRISPEECH_ROOT, "train-clean-100", TRAIN_SUBSET)

# Same for test-clean (again, from scratch)
test_df, dataset_test = _build_audio_dataset(LIBRISPEECH_ROOT, "test-clean", TEST_SUBSET)


In [None]:
# ===  RAM-Safe Preprocessing ===
# Only use first 2000 training samples for now

def prepare_batch(batch):
    """Convert one dataset example into model inputs and label ids.

    Args:
        batch: A single example with an ``"audio"`` entry (read here through its
            ``"array"`` and ``"sampling_rate"`` keys) and a ``"text"`` transcript.

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    # Forward the clip's own sampling rate instead of a hard-coded 16000, so a
    # clip that was not resampled is not silently treated as 16 kHz audio.
    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_values[0]
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

# Cap the training set at a size that is safe for Colab RAM. min() keeps the
# selection valid when fewer utterances are available than the cap.
N_TRAIN_SAMPLES = 2000
dataset_train = dataset_train.select(range(min(N_TRAIN_SAMPLES, len(dataset_train))))

# Preprocess the training subset and the full test-clean split (small enough to fit)
dataset_train = dataset_train.map(prepare_batch, remove_columns=["audio", "text"])
dataset_test  = dataset_test.map(prepare_batch, remove_columns=["audio", "text"])

print("✅ Preprocessing complete (subset of train, full test)")


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2620 [00:00<?, ? examples/s]

✅ Preprocessing complete (subset of train, full test)


In [None]:
from dataclasses import dataclass


In [None]:
# === Cell D: Data Collator (for padding) ===
from dataclasses import dataclass  # ✅ Required for @dataclass

@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of preprocessed examples into one training batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
        padding: Forwarded unchanged to both ``pad`` calls.
    """

    processor: "Wav2Vec2Processor"
    padding: bool = True

    def __call__(self, features):
        """Collate ``features`` (dicts with ``input_values`` and ``labels``).

        Returns:
            The padded feature-extractor batch with a ``"labels"`` entry in which
            every padded position holds -100.
        """
        # Inputs and labels are padded separately because each goes through
        # its own pad() implementation.
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # The tokenizer is addressed directly, so the deprecated
        # `processor.as_target_processor()` context manager is not needed.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Replace padding tokens with -100
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )
        batch["labels"] = labels
        return batch

# Collator instance handed to the Trainer; it pads with the shared processor
data_collator = DataCollatorCTCWithPadding(processor)


In [None]:
# === Cell E: WER Evaluation Function ===
# Word-error-rate metric object used by compute_metrics
wer_metric = evaluate.load(path="wer")

def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (scores whose argmax over the last
            axis gives token ids) and ``label_ids`` (reference ids in which -100
            marks positions to ignore).

    Returns:
        A dict with the single key ``"wer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = processor.batch_decode(pred_ids)
    # Build a new array instead of writing into pred.label_ids, so the caller's
    # labels keep their -100 markers after this function returns.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {"wer": wer_metric.compute(predictions=pred_str, references=label_str)}


In [None]:
!pip install --upgrade transformers




In [None]:
from transformers import TrainingArguments


In [None]:
from transformers import TrainingArguments

# Smoke test for the installed transformers release. The keyword is
# `eval_strategy`; the older spelling `evaluation_strategy` is rejected by the
# version installed at the top of this notebook.
args = TrainingArguments(
    output_dir="./test",
    eval_strategy="epoch",
)
print("TrainingArguments works!")


In [None]:
# === Cell F: TrainingArguments and Trainer ===
import inspect

from transformers import TrainingArguments, Trainer

output_dir = "/content/drive/MyDrive/LibriSpeech/finetuned-wav2vec2"


def _supported_kwarg(callable_obj, preferred, legacy):
    """Return ``preferred`` if ``callable_obj`` accepts it as a keyword, else ``legacy``."""
    return preferred if preferred in inspect.signature(callable_obj).parameters else legacy


# Two keywords were renamed across transformers releases; choose whichever
# spelling the installed version understands instead of hard-coding one.
_eval_kwarg = _supported_kwarg(TrainingArguments.__init__, "eval_strategy", "evaluation_strategy")
_processing_kwarg = _supported_kwarg(Trainer.__init__, "processing_class", "tokenizer")

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    learning_rate=1e-4,
    logging_steps=100,
    # load_best_model_at_end requires the save and evaluation strategies to
    # match, so checkpoints are written once per epoch as well (this replaces
    # the previous step-based save_steps=1000 setting).
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    # Choose the best checkpoint by the "wer" value returned from
    # compute_metrics; a lower WER is better.
    metric_for_best_model="wer",
    greater_is_better=False,
    **{_eval_kwarg: "epoch"},
)

trainer = Trainer(
    model=model_ft,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_processing_kwarg: processor.feature_extractor},
)


In [None]:
# === Cell G: Start Fine-Tuning ===
trainer.train()
trainer.save_model(output_dir)
# The Trainer was only given the feature extractor, so the full processor
# (feature extractor + tokenizer) is saved explicitly; this lets output_dir be
# loaded later with Wav2Vec2Processor.from_pretrained.
processor.save_pretrained(output_dir)
print(f" Fine-tuned model saved to: {output_dir}")


In [None]:
!pip install fastapi uvicorn transformers torchaudio soundfile python-multipart




In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Local export folder for the FastAPI deployment
export_dir = "/content/wav2vec2-fastapi"

# NOTE(review): `model` is the object loaded for the baseline evaluation; the
# fine-tuning cells train `model_ft`. Confirm which one should be exported here.
# Saving the processor also writes preprocessor_config.json.
for artifact in (processor, model):
    artifact.save_pretrained(export_dir)

# List what was written
import os
print("Saved files:", os.listdir(export_dir))
