In [1]:
from google.colab import drive
drive.mount('/DataSet/drive')

Mounted at /DataSet/drive


In [3]:
!pip install jiwer


Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.3


In [7]:
import os
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import jiwer

def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform.squeeze().numpy()

def read_text(file_path):
    with open(file_path, 'r') as file:
        return file.read().strip()

def evaluate_asr_and_calculate_cer(audio_dir, text_dir, model_name='facebook/wav2vec2-large-960h'):
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    model.eval()

    cer_scores = []
    total_samples = 0

    for audio_file in os.listdir(audio_dir):
        if audio_file.endswith('.wav'):
            audio_path = os.path.join(audio_dir, audio_file)
            text_file = audio_file.replace('.wav', '.txt')
            text_path = os.path.join(text_dir, text_file)

            if os.path.exists(text_path):
                # Load and preprocess the audio
                input_audio = load_audio(audio_path)
                input_values = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_values

                # Perform ASR
                with torch.no_grad():
                    logits = model(input_values).logits
                predicted_ids = torch.argmax(logits, dim=-1)
                transcription = processor.batch_decode(predicted_ids)[0]

                # Load reference transcription
                reference = read_text(text_path)

                # Check if reference and transcription are not empty
                if reference and transcription:
                    # Compute CER
                    cer = jiwer.cer(reference, transcription)
                    cer_scores.append(cer)
                    total_samples += 1

                    print(f"[INFO] Processing file: {audio_file}")
                    print(f"    Reference    : {reference}")
                    print(f"    Transcription: {transcription}")
                    print(f"    CER          : {cer:.4f}")
                    print("    -----------")
                else:
                    print(f"[INFO] Skipping {audio_file}: Transcription is empty.")
                    print("    -----------")

    if total_samples > 0:
        average_cer = sum(cer_scores) / total_samples
        print(f"Average CER: {average_cer:.4f}")
    else:
        print("No valid samples found to compute CER.")

audio_directory = '/DataSet/drive/MyDrive/nptel-pure/wav'
text_directory = '/DataSet/drive/MyDrive/nptel-pure/corrected_txt'
evaluate_asr_and_calculate_cer(audio_directory, text_directory)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

[INFO] Processing file: 00003cb6eb91e90adcb398c71dd3c41032035506ec1f5e0bd7848da5.wav
    Reference    : IS NOTHING BUT YOUR AREA OF CROSS SECTION DIVIDED BY THE TOP WIDTH
    Transcription: IS NOTHING WORT YOR ADIO CROSSECTION DIVIDED WISE THE TOPPED WIT
    CER          : 0.3030
    -----------
[INFO] Processing file: 00003068300b3a77cfdd0208addd752f26c0c084963a1cdc0b4459ec.wav
    Reference    : DENSITY PROFILE AND THIS HAS BEEN FOUND TO BE VERY SUITABLE FOR GUIDING THE LASER SO
    Transcription: DENSELY PROFILE AND THIS HAS BEEN FOUND TO BE VERY SUITABLE FOR GUIDING THE LAZER SO
    CER          : 0.0357
    -----------
[INFO] Processing file: 0000381573d407fe83934438efd5d2c0727766e4e14a79eca4aeb7f6.wav
    Reference    : X NONNEGATIVE AND A IS A M BY N MATRIX AND RANK OF A IS EQUAL
    Transcription: EXTNONIGAT YOU AND AZA EMBINMATRIX AN RANK OF AS EQUAL
    CER          : 0.3279
    -----------
[INFO] Processing file: 0000376fe5fbeaa1b14731b758f1faa230d8a0229e3fe8c15d4ce958.wav
 